<a href="https://colab.research.google.com/github/AR13570/MiniProject/blob/main/MiniProjDescAns_Init.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **ARNAV**

## Importing Libraries

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import re
import string
import nltk
nltk.download('stopwords',quiet=True)
nltk.download('wordnet',quiet=True)
nltk.download('punkt',quiet=True)
nltk.download('omw-1.4',quiet=True)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize

!pip --quiet install -U sentence-transformers
from sentence_transformers import SentenceTransformer,CrossEncoder
import pandas as pd

!pip install transformers

from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForMaskedLM

!python -m spacy download en_core_web_lg
import spacy

import itertools
import pprint

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m105.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m99.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, htt

## PreProcessing - preprocess_text(text)
### Not needed for contextual models

In [None]:
def preprocess_text(text):
    """Normalise raw text into a space-separated string of lemmatised tokens.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, tokenise with ``nltk.word_tokenize``, drop English
    stop words, lemmatise each remaining token with ``WordNetLemmatizer`` and
    join the result with single spaces.

    Note that punctuation is deleted (not replaced by a space) before
    tokenising, so hyphenated words such as "object-oriented" are fused into
    one token.

    Args:
        text: The string to normalise.

    Returns:
        The normalised text as a single string.
    """
    # Lowercase and strip punctuation in one pass over the string.
    punctuation_stripper = str.maketrans("", "", string.punctuation)
    raw_tokens = nltk.word_tokenize(text.lower().translate(punctuation_stripper))

    english_stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Filter stop words and lemmatise the survivors in a single loop.
    kept_lemmas = []
    for token in raw_tokens:
        if token in english_stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))

    return ' '.join(kept_lemmas)


## Cosine-Similarity Function - cos_sim(emb1,emb2)
### Calculates cos-sim between 2 columns of embeddings

In [None]:
def cos_sim(sentence1_emb, sentence2_emb):
    """Return the cosine similarity of each row pair (row i with row i).

    The similarity is computed directly per pair instead of building the full
    pairwise matrix and reading its diagonal, so time and memory grow linearly
    with the number of pairs rather than quadratically.

    Args:
        sentence1_emb: 2-D array-like of embeddings, one row per sentence.
        sentence2_emb: 2-D array-like of embeddings with the same width.

    Returns:
        1-D ``numpy.ndarray`` whose i-th entry is the cosine similarity of
        ``sentence1_emb[i]`` and ``sentence2_emb[i]``. A pair containing an
        all-zero vector gets similarity 0.
    """
    first = np.asarray(sentence1_emb, dtype=float)
    second = np.asarray(sentence2_emb, dtype=float)

    # The diagonal of an (n, m) matrix has min(n, m) entries; keep that
    # contract when the two inputs have a different number of rows.
    n_pairs = min(first.shape[0], second.shape[0])
    first, second = first[:n_pairs], second[:n_pairs]

    first_norms = np.linalg.norm(first, axis=1)
    second_norms = np.linalg.norm(second, axis=1)
    # Treat a zero norm as 1 so zero vectors yield similarity 0, not NaN.
    first_norms[first_norms == 0] = 1.0
    second_norms[second_norms == 0] = 1.0

    # Row-wise dot product of the unit-normalised embeddings.
    return np.einsum(
        'ij,ij->i',
        first / first_norms[:, np.newaxis],
        second / second_norms[:, np.newaxis],
    )

## Load the Dataset - DataFrame df
### Consider only the expected answer and student's answer
### (needs to be extended to include allotted scores - data currently not available)

In [None]:
# Load the raw answer sheet straight from the project repository.
df = pd.read_csv("https://raw.githubusercontent.com/AR13570/MiniProject/main/Data%20sets.csv")
# Keep only the columns at positions 3, 4 and 5. `.copy()` makes `df` an
# independent frame, so adding score columns to it later does not trigger
# pandas' SettingWithCopyWarning.
df = df.iloc[:, [3, 4, 5]].copy()
df.columns = ['Key', 'Ans', 'Marks']

In [None]:
df.head()

Unnamed: 0,Key,Ans,Marks
0,Object-oriented programming (OOP) is a compute...,"An OOP is a modular approach, which allows dat...",4
1,Object-oriented programming (OOP) is a compute...,An opps is a modular approach which allows dat...,4
2,Object-oriented programming (OOP) is a compute...,A programming language structure where in the ...,5
3,Object-oriented programming (OOP) is a compute...,Object Oriented Programming involves programmi...,5
4,Object-oriented programming (OOP) is a compute...,Object Oriented Programming approach organizes...,5


## The USE model
### use_eval(answer, expected_answer, df)

In [None]:
def use_eval(answer, expected_answer,df):
    """Score answer/expected-answer pairs with the Universal Sentence Encoder.

    Loads USE v4 from TF Hub on every call, embeds both inputs, computes the
    pairwise cosine similarity with ``cos_sim`` and writes the result into
    ``df['USE_Sim_Scores']``. Nothing is returned; ``df`` is modified in place.

    Args:
        answer: Student answers to embed (presumably a sequence of strings,
            aligned with ``expected_answer`` - confirm against callers).
        expected_answer: Reference answers to embed.
        df: Object that receives the ``'USE_Sim_Scores'`` column.
    """
    encoder = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

    def embed(texts):
        # The encoder returns a tensor; downstream code works on NumPy arrays.
        return encoder(texts).numpy()

    # Embed the student answers first, then the reference answers.
    student_vectors = embed(answer)
    reference_vectors = embed(expected_answer)

    df['USE_Sim_Scores'] = cos_sim(student_vectors, reference_vectors)

## S-BERT CrossEncoder
### sbert_cross(answer, expected_answer, df)

In [None]:
def sbert_cross(answer, expected_answer,df):
    """Score answer/expected-answer pairs with an S-BERT cross-encoder.

    Loads ``cross-encoder/stsb-roberta-base`` on every call, feeds it the
    ``[answer, expected_answer]`` pairs and stores the predicted scores in
    ``df['Similarity']``. Nothing is returned; ``df`` is modified in place.

    Args:
        answer: Iterable of student answers.
        expected_answer: Iterable of reference answers, paired with ``answer``
            by position (``zip`` stops at the shorter of the two).
        df: Object that receives the ``'Similarity'`` column.
    """
    scorer = CrossEncoder('cross-encoder/stsb-roberta-base')

    # A cross-encoder consumes both sentences together and yields a score
    # directly, so no intermediate embeddings are produced.
    paired_sentences = [
        [student_text, reference_text]
        for student_text, reference_text in zip(answer, expected_answer)
    ]

    df['Similarity'] = scorer.predict(paired_sentences)

## S-BERT BiEncoder
### sbert_bi(answer, expected_answer, df)

In [None]:
def sbert_bi(answer, expected_answer,df):
    """Score answer/expected-answer pairs with an S-BERT bi-encoder.

    Loads ``stsb-mpnet-base-v2`` on every call, encodes both inputs
    independently, computes the pairwise cosine similarity with ``cos_sim``
    and stores it in ``df['SBERT_BiEncoder_Sim_Scores']``. Nothing is
    returned; ``df`` is modified in place.

    Args:
        answer: Student answers to encode.
        expected_answer: Reference answers to encode.
        df: Object that receives the ``'SBERT_BiEncoder_Sim_Scores'`` column.
    """
    encoder = SentenceTransformer('stsb-mpnet-base-v2')

    # Student answers are encoded first, then the reference answers.
    student_vectors = encoder.encode(answer)
    reference_vectors = encoder.encode(expected_answer)

    df['SBERT_BiEncoder_Sim_Scores'] = cos_sim(student_vectors, reference_vectors)

## Run all models

Running the model on both the CPU and the GPU to compare performance.

In [None]:
def cpu():
    """Run ``sbert_cross`` on the global ``df`` inside a TensorFlow CPU scope.

    NOTE(review): the download log for the cross-encoder lists
    ``pytorch_model.bin``, so the model presumably runs on PyTorch, which a
    ``tf.device`` scope does not control - confirm that this really pins the
    work to the CPU before trusting the benchmark.
    """
    student_answers, reference_answers = df['Ans'], df['Key']
    with tf.device('/cpu:0'):
        sbert_cross(student_answers, reference_answers, df)
def gpu():
    """Run ``sbert_cross`` on the global ``df`` inside a TensorFlow GPU scope.

    If TensorFlow does not report ``/device:GPU:0`` the function only prints
    "No gpu access" and does no scoring.

    NOTE(review): the download log for the cross-encoder lists
    ``pytorch_model.bin``, so the model presumably runs on PyTorch, which a
    ``tf.device`` scope does not control - confirm that this really moves the
    work to the GPU before trusting the benchmark.
    """
    gpu_scope = '/device:GPU:0'

    # Bail out early when TensorFlow cannot see the first GPU.
    if tf.test.gpu_device_name() != gpu_scope:
        print("No gpu access")
        return

    with tf.device(gpu_scope):
        sbert_cross(df['Ans'], df['Key'], df)

In [None]:
import timeit

# Warm-up runs so that one-off costs (e.g. the model download) are not
# included in the timed runs below.
cpu()
gpu()

# Time a single run of each variant. `timeit.timeit` accepts a callable, so
# there is no need to re-import the functions from `__main__`.
print('CPU (s):',end=" ")
cpu_time = timeit.timeit(cpu, number=1)
print(cpu_time)
print('GPU (s):',end=" ")
gpu_time = timeit.timeit(gpu, number=1)
print(gpu_time)

# Report the ratio with two decimals: truncating it to an int printed "0x"
# whenever the GPU run was slower than the CPU run.
# NOTE(review): when no GPU is available `gpu()` only prints a message, so
# `gpu_time` is close to zero and this ratio carries no meaning.
print('GPU speedup over CPU: {:.2f}x'.format(cpu_time/gpu_time))

Downloading (…)lve/main/config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

CPU (s): 3.593869256000005
GPU (s): 4.551360982000006
GPU speedup over CPU: 0x


In [None]:
df.head(10)

Unnamed: 0,Key,Ans,Marks,Similarity
0,Object-oriented programming (OOP) is a compute...,"An OOP is a modular approach, which allows dat...",4,0.548109
1,Object-oriented programming (OOP) is a compute...,An opps is a modular approach which allows dat...,4,0.373515
2,Object-oriented programming (OOP) is a compute...,A programming language structure where in the ...,5,0.503911
3,Object-oriented programming (OOP) is a compute...,Object Oriented Programming involves programmi...,5,0.612272
4,Object-oriented programming (OOP) is a compute...,Object Oriented Programming approach organizes...,5,0.677486
5,Object-oriented programming (OOP) is a compute...,object-oriented programming is a programming p...,5,0.619386
6,Object-oriented programming (OOP) is a compute...,The object oriented programming is the basic c...,4,0.473006
7,Object-oriented programming (OOP) is a compute...,Objet oriented Programming - It is defined as ...,5,0.520034
8,Object-oriented programming (OOP) is a compute...,Object oriented Pocagramming is based on the c...,6,0.568977
9,Object-oriented programming (OOP) is a compute...,Object oriented programming is a programming p...,4,0.597188


## Show the similarities and rank the models

In [None]:
# score_cols = [col for col in df.columns if '_Scores' in col]
# eval_df = df[score_cols].corr(method='spearman').iloc[1:, 0:]*100
# eval_df.head(10)

# **PRATHAM**

## Import the saved model and testing it





To run on a local machine (even one with only a CPU), we can simply load the saved model and experiment with it.


In [None]:
# Hugging Face text-classification pipelines for two grammar-checker checkpoints.
# NOTE(review): `obtained_model_bert_uncased` is not referenced again in the
# visible part of this notebook - confirm it is still needed.
obtained_model_bert_uncased=pipeline("text-classification",model="abdulmatinomotoso/English_Grammar_Checker")
# NOTE(review): the variable name says "roberta" while the checkpoint id ends
# in "DeBerta" - confirm which architecture this actually is.
obtained_model_roberta=pipeline("text-classification",model="imohammad12/GRS-Grammar-Checker-DeBerta")

Downloading (…)lve/main/config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/348 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/557M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/778 [00:00<?, ?B/s]

In [None]:
# The checkpoint stores a sequence-classification head (`classifier.weight`,
# `classifier.bias`). Loading it as a masked-LM model discards that head and
# randomly initialises an untrained MLM head, so use the matching Auto class.
from transformers import AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("abdulmatinomotoso/English_Grammar_Checker")
model = AutoModelForSequenceClassification.from_pretrained("abdulmatinomotoso/English_Grammar_Checker")

Some weights of the model checkpoint at abdulmatinomotoso/English_Grammar_Checker were not used when initializing BertForMaskedLM: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at abdulmatinomotoso/English_Grammar_Checker and are newly initialized: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should prob

## RoBERTa

In [None]:
def roberta(df):
  """Attach a grammar score to every answer in ``df['Ans']``.

  Each answer is passed to the global ``obtained_model_roberta`` pipeline and
  the ``"score"`` of the first returned prediction is stored, in order, in
  ``df['Grammer']``. Nothing is returned; ``df`` is modified in place.

  NOTE(review): the stored value is the score of whichever label comes first
  in the pipeline output; presumably that is the predicted label rather than
  always the "grammatical" one - verify against the model's label set.
  """
  df['Grammer'] = [
      obtained_model_roberta(sentence)[0]["score"]
      for sentence in df['Ans']
  ]

## Computing grammar scores for the dataset

In [None]:
roberta(df)

# **HAARISH**

## Loading spacy

In [None]:
nlp = spacy.load("en_core_web_lg")

## Loading data into variables

In [None]:
# Student answers as a plain Python list, one entry per dataframe row.
stud_ans = df['Ans'].tolist()
# Only the reference answer of the first row is used for keyword extraction.
# NOTE(review): presumably every row shares the same reference answer - confirm.
key = df['Key'].tolist()[0]
# Parsed spaCy document of the reference answer.
ans_doc = nlp(key)

## List of Keywords

In [None]:
#@title List of Keywords

def extract_POS(sample_doc):
    """Extract candidate keyword phrases from the noun chunks of a document.

    For every chunk in ``sample_doc.noun_chunks`` the tokens tagged NOUN,
    PROPN or ADJ that are neither stop words nor punctuation are lowercased
    and joined with single spaces. Chunks that yield no such token are
    skipped.

    Args:
        sample_doc: Parsed document exposing ``noun_chunks``.

    Returns:
        List of phrases in order of first appearance, without duplicates.
    """
    wanted_tags = ('NOUN', 'PROPN', 'ADJ')
    phrases = []
    for chunk in sample_doc.noun_chunks:
        kept_words = [
            token.text.lower()
            for token in chunk
            if token.pos_ in wanted_tags and not (token.is_stop or token.is_punct)
        ]
        phrase = " ".join(kept_words).strip()
        if phrase:
            phrases.append(phrase)

    # dict keys keep insertion order, so this de-duplicates without reordering.
    return list(dict.fromkeys(phrases))

key_POS=extract_POS(ans_doc)

In [None]:
# Sentence-embedding model used to compare the reference answer with its
# candidate keyword phrases.
# NOTE(review): this rebinds the global name `model`, which an earlier cell
# already assigned to a transformers model.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
# Embedding of the whole reference answer (wrapped in a list, so one row).
doc_embedding = model.encode([key])
# One embedding per candidate phrase extracted from the reference answer.
candidate_embeddings = model.encode(key_POS)

Downloading (…)925a9/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)1a515925a9/README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading (…)515925a9/config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)925a9/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading (…)1a515925a9/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)15925a9/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

## Cosine Similarity 

In [None]:
# Number of candidate phrases to keep as keywords.
top_n = 7
# Despite the name, `distances` holds cosine similarities between the
# reference-answer embedding and every candidate phrase embedding.
distances = cosine_similarity(doc_embedding, candidate_embeddings)
# argsort is ascending, so the last `top_n` indices are the most similar
# candidates; `keywords` therefore runs from least to most similar.
keywords = [key_POS[index] for index in distances.argsort()[0][-top_n:]]



## Matching keywords

In [None]:
def matching_keywords(stdlst,keylst):
    """Return the entries of ``stdlst`` that also occur in ``keylst``.

    Args:
        stdlst: Phrases extracted from a student's answer.
        keylst: Keyword phrases taken from the reference answer.

    Returns:
        New list with the matching entries, in the order (and with the
        multiplicity) in which they appear in ``stdlst``.
    """
    return [phrase for phrase in stdlst if phrase in keylst]



## Assign Weights to Keywords

In [None]:
def dictionary_with_weights(words):
    """Group keyword phrases into weight buckets by part of speech.

    Each phrase is parsed with the global spaCy pipeline ``nlp`` and only the
    tag of its first token (``doc[0].pos_``) is inspected:

    * NOUN or PRON -> bucket ``'9'``
    * PROPN        -> bucket ``'8.5'``
    * ADJ          -> bucket ``'5'``

    A phrase whose first token carries any other tag is left out of every
    bucket.

    Args:
        words: Iterable of keyword phrases.

    Returns:
        Dict with the keys ``'9'``, ``'8.5'`` and ``'5'``, each mapping to the
        list of phrases in that bucket (input order preserved).
    """
    bucket_for_tag = {'NOUN': '9', 'PRON': '9', 'PROPN': '8.5', 'ADJ': '5'}
    categorized_words = {'9': [], '8.5': [], '5': []}

    for word in words:
        bucket = bucket_for_tag.get(nlp(word)[0].pos_)
        if bucket is not None:
            categorized_words[bucket].append(word)

    return categorized_words

keywords_scores = dictionary_with_weights(keywords)
print(keywords_scores)


{'9': ['oop', 'functions', 'logic', 'object programming', 'software design', 'computer programming model'], '8.5': ['data'], '5': []}


## Final Scoring

In [None]:
# Highest weighted score an answer can reach, i.e. every selected keyword is
# matched. The bucket keys of `keywords_scores` ('9', '8.5', '5') are also the
# numeric weights of the keywords they hold.
total = sum(float(weight) * len(words) for weight, words in keywords_scores.items())

relative_scores = []
for count, stud_doc in enumerate(stud_ans, start=1):
    # A float entry (presumably a missing answer read as NaN) cannot be
    # parsed by spaCy, so turn it into a string first.
    if isinstance(stud_doc, float):
        stud_doc = str(stud_doc)
    std_POS = extract_POS(nlp(stud_doc))
    matched = matching_keywords(std_POS, keywords)

    # Add up the weight of every matched keyword; the first bucket that
    # contains the keyword decides its weight.
    stud_score = 0
    for i in matched:
        for weight, words in keywords_scores.items():
            if i in words:
                stud_score += float(weight)
                break

    # Guard both ratios so an empty keyword list cannot divide by zero.
    match_ratio = len(matched) / len(keywords) if keywords else 0.0
    score = stud_score / total if total else 0.0

    print("Evaluation of student:",count,"'s response:")
    print("Matching percentage with keywords:",match_ratio)
    print("Relative score using weights:",score)
    print("Matched Special POS keywords:", matched)
    print("\n")
    relative_scores.append(score)

# Write the whole column in one assignment. The element-wise chained form
# `df['Keyword'][i] = ...` raises SettingWithCopyWarning and may silently
# write to a temporary copy instead of `df`.
df['Keyword'] = relative_scores

Evaluation of student: 1 's response:
Matching percentage with keywords: 0.2857142857142857
Relative score using weights: 0.28
Matched Special POS keywords: ['oop', 'data']


Evaluation of student: 2 's response:
Matching percentage with keywords: 0.14285714285714285
Relative score using weights: 0.136
Matched Special POS keywords: ['data']


Evaluation of student: 3 's response:
Matching percentage with keywords: 0.14285714285714285
Relative score using weights: 0.136
Matched Special POS keywords: ['data']


Evaluation of student: 4 's response:
Matching percentage with keywords: 0.14285714285714285
Relative score using weights: 0.136
Matched Special POS keywords: ['data']


Evaluation of student: 5 's response:
Matching percentage with keywords: 0.14285714285714285
Relative score using weights: 0.136
Matched Special POS keywords: ['data']


Evaluation of student: 6 's response:
Matching percentage with keywords: 0.2857142857142857
Relative score using weights: 0.28
Matched Special PO

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Keyword'][count-1] = score


Evaluation of student: 14 's response:
Matching percentage with keywords: 0.2857142857142857
Relative score using weights: 0.28
Matched Special POS keywords: ['oop', 'data']


Evaluation of student: 15 's response:
Matching percentage with keywords: 0.14285714285714285
Relative score using weights: 0.136
Matched Special POS keywords: ['data']


Evaluation of student: 16 's response:
Matching percentage with keywords: 0.14285714285714285
Relative score using weights: 0.136
Matched Special POS keywords: ['data']


Evaluation of student: 17 's response:
Matching percentage with keywords: 0.14285714285714285
Relative score using weights: 0.136
Matched Special POS keywords: ['data']


Evaluation of student: 18 's response:
Matching percentage with keywords: 0.0
Relative score using weights: 0.0
Matched Special POS keywords: []


Evaluation of student: 19 's response:
Matching percentage with keywords: 0.14285714285714285
Relative score using weights: 0.136
Matched Special POS keywords: ['da

# Final DataFrame

In [None]:
df.head(20)


Unnamed: 0,Key,Ans,Marks,Similarity,Grammer,Keyword
0,Object-oriented programming (OOP) is a compute...,"An OOP is a modular approach, which allows dat...",4,0.548109,0.784988,0.28
1,Object-oriented programming (OOP) is a compute...,An opps is a modular approach which allows dat...,4,0.373515,0.70949,0.136
2,Object-oriented programming (OOP) is a compute...,A programming language structure where in the ...,5,0.503911,0.933529,0.136
3,Object-oriented programming (OOP) is a compute...,Object Oriented Programming involves programmi...,5,0.612272,0.962539,0.136
4,Object-oriented programming (OOP) is a compute...,Object Oriented Programming approach organizes...,5,0.677486,0.668948,0.136
5,Object-oriented programming (OOP) is a compute...,object-oriented programming is a programming p...,5,0.619386,0.940307,0.28
6,Object-oriented programming (OOP) is a compute...,The object oriented programming is the basic c...,4,0.473006,0.935807,0.144
7,Object-oriented programming (OOP) is a compute...,Objet oriented Programming - It is defined as ...,5,0.520034,0.872376,0.136
8,Object-oriented programming (OOP) is a compute...,Object oriented Pocagramming is based on the c...,6,0.568977,0.521357,0.136
9,Object-oriented programming (OOP) is a compute...,Object oriented programming is a programming p...,4,0.597188,0.989838,0.136


In [None]:
df.to_csv('MiniProj_dataset_with_metrics.csv')

In [None]:
df.head()

Unnamed: 0,Key,Ans,Marks,Similarity,Grammer,Keyword
0,Object-oriented programming (OOP) is a compute...,"An OOP is a modular approach, which allows dat...",4,0.548109,0.784988,0.28
1,Object-oriented programming (OOP) is a compute...,An opps is a modular approach which allows dat...,4,0.373515,0.70949,0.136
2,Object-oriented programming (OOP) is a compute...,A programming language structure where in the ...,5,0.503911,0.933529,0.136
3,Object-oriented programming (OOP) is a compute...,Object Oriented Programming involves programmi...,5,0.612272,0.962539,0.136
4,Object-oriented programming (OOP) is a compute...,Object Oriented Programming approach organizes...,5,0.677486,0.668948,0.136


# Linear Regression

In [None]:
import pandas as pd
from sklearn import linear_model

import pickle

# Load the dataset that already contains the three per-answer metrics.
df = pd.read_csv('https://raw.githubusercontent.com/AR13570/MiniProject/main/MiniProj_dataset_with_metrics.csv')
x = df[['Similarity','Grammer','Keyword']].values.tolist()
y = df['Marks'].values.tolist()

# Fit a linear model mapping the three metrics to the awarded marks.
# NOTE(review): the model is fitted on the full dataset; there is no held-out
# split to judge how well it generalises.
regr = linear_model.LinearRegression()
regr.fit(x, y)

# Save the trained model. The `with` block closes the file handle so the
# pickle is fully flushed before it is read back.
with open("regression.pickle", 'wb') as model_file:
    pickle.dump(regr, model_file)

# Load the pickled model (a file this notebook has just written itself).
with open("regression.pickle", 'rb') as model_file:
    saved_model = pickle.load(model_file)

# Predict for the first row, using the same plain-list input format the model
# was fitted on (avoids scikit-learn's feature-name mismatch warning).
saved_model.predict(x[:1])



array([4.92058169])

# Using Regression Model from saved file

In [None]:
import pickle
import numpy as np

def finalMarks(li):
    """Predict whole-number marks (0-7) from metric rows with the saved model.

    Loads the model stored in ``regression.pickle``, predicts a mark for every
    row of ``li``, clips each prediction to the range 0-7 and rounds it down
    to an integer.

    Args:
        li: 2-D list/array of feature rows, e.g. ``[[similarity, grammar,
            keyword]]``.

    Returns:
        An ``int`` when ``li`` holds a single row (the original behaviour),
        otherwise a list of ``int`` with one mark per row.
    """
    # SECURITY: unpickling executes arbitrary code; only load a
    # "regression.pickle" produced by this project, never an untrusted file.
    with open("regression.pickle", 'rb') as model_file:
        loaded_model = pickle.load(model_file)

    predictions = np.ravel(loaded_model.predict(li))
    # Clip first, then floor: the result of np.floor must be kept (it was
    # previously discarded), and clipping keeps every mark within 0-7.
    marks = [int(value) for value in np.floor(np.clip(predictions, 0, 7))]

    return marks[0] if len(marks) == 1 else marks

In [None]:
finalMarks([[1,1,1]])

7