In [50]:
# ! pip install tensorflow
# ! pip install transformers
# ! pip install torch
# ! pip install pytorch

# import torch
# print(torch.__version__)
# conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cpuonly -c pytorch

In [69]:
import re
import logging
import numpy as np
import pandas as pd
import multiprocessing

from re import sub
from time import time 
from unidecode import unidecode
from gensim.models import Word2Vec
from collections import defaultdict
from gensim.models import KeyedVectors
from gensim.test.utils import get_tmpfile
from gensim.models.phrases import Phrases, Phraser

from gensim.models import Word2Vec
from sklearn.cluster import KMeans

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model

import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [52]:
def import_data():
    '''Convert the raw feedback spreadsheet into the CSV used by the later steps.

    Reads ``sample_feedbacks.xlsx`` (which must provide ``feedbacks`` and
    ``target`` columns), recodes ``target`` so that 4 and 2 become 1 and 0
    becomes -1, adds a ``length`` column holding the character count of each
    feedback, renames ``feedbacks`` -> ``description`` and ``target`` -> ``rate``
    and writes the result to ``student_feebacks.csv`` (this spelling is the
    file name ``data_preprocessing`` reads back).

    Returns:
        None. The only result is the CSV file written to the working directory.
    '''
    df = pd.read_excel(r'sample_feedbacks.xlsx')

    # Assign the recoded column back rather than calling
    # ``df.target.replace(..., inplace=True)``: that form mutates a Series
    # obtained from the frame, which no longer updates the frame itself once
    # pandas Copy-on-Write is active.
    df['target'] = df['target'].replace({4: 1, 2: 1, 0: -1})

    # ``.str.len()`` yields NaN for a missing feedback cell, where ``len(x)``
    # on a float NaN would raise TypeError.
    df['length'] = df['feedbacks'].str.len()

    df = df.rename(columns={'feedbacks': 'description', 'target': 'rate'})

    df.to_csv('student_feebacks.csv', index=False)

In [53]:
def text_to_word_list(text, remove_polish_letters):
    '''Normalise a piece of text and split it into a list of tokens.

    Method inspired by the one in the eliorc github repo:
    https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb

    text - any value; it is converted with ``str()`` and lower-cased first
    remove_polish_letters - accepted for interface compatibility, but this
        function does not call it
    returns - list of whitespace-separated tokens after the substitutions below
    '''
    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r"[^A-Za-z0-9^,!?.\/'+]", " "),  # anything outside the allowed set
        (r"\+", " plus "),
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),                     # keep "!" as its own token
        (r"\?", " ? "),                    # keep "?" as its own token
        (r"'", " "),
        (r":", " : "),
        (r"\s{2,}", " "),                  # collapse runs of whitespace
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = sub(pattern, replacement, cleaned)

    return cleaned.split()

In [55]:
def data_preprocessing():
    '''Clean the feedback CSV, train a Word2Vec model on it and export the cleaned text.

    Steps:
      1. Read ``student_feebacks.csv``, drop rows with missing values and
         duplicate rows, rename ``description`` to ``title`` and drop rows
         whose ``rate`` is 0.
      2. Tokenise every title with ``text_to_word_list`` and keep only rows
         with more than one token.
      3. Learn bigram phrases over the token lists and train a Word2Vec model
         on the phrase-merged sentences; save it to ``word2vec.model``.
      4. Write the phrase-merged titles (space-joined) together with ``rate``
         (as int8) to ``cleaned_dataset.csv``.

    Returns:
        None. Results are the two files written to the working directory and
        the timing lines printed to stdout.
    '''
    file = pd.read_csv("student_feebacks.csv")

    file_cleaned = (
        file.dropna()
        .drop_duplicates()
        .reset_index(drop=True)
        .rename(columns={'description': 'title'})
    )
    # Take explicit copies after filtering so the column assignment below
    # works on an independent frame instead of a slice of another one.
    file_cleaned = file_cleaned[file_cleaned.rate != 0].copy()

    file_cleaned['title'] = file_cleaned['title'].apply(lambda x: text_to_word_list(x, unidecode))
    # ``.str.len()`` on a column of lists gives the number of tokens per row.
    file_model = file_cleaned[file_cleaned['title'].str.len() > 1].copy()

    sent = list(file_model['title'])
    phrases = Phrases(sent, min_count=1, progress_per=50000)
    bigram = Phraser(phrases)
    sentences = bigram[sent]

    w2v_model = Word2Vec(min_count=3,
                         window=4,
                         vector_size=300,
                         sample=1e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=20,
                         # Leave one core free, but never ask for zero workers
                         # (cpu_count() - 1 is 0 on a single-core machine).
                         workers=max(1, multiprocessing.cpu_count() - 1))

    start = time()

    w2v_model.build_vocab(sentences, progress_per=50000)

    print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))

    start = time()

    w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

    print('Time to train the model: {} mins'.format(round((time() - start) / 60, 2)))

    # gensim 4 replacement for the deprecated ``init_sims(replace=True)``:
    # destructively scales every word vector to unit length before saving.
    w2v_model.wv.unit_normalize_all()

    w2v_model.save("word2vec.model")

    file_export = file_model.copy()
    file_export['title'] = file_export['title'].apply(lambda x: ' '.join(bigram[x]))
    file_export['rate'] = file_export['rate'].astype('int8')

    file_export[['title', 'rate']].to_csv('cleaned_dataset.csv', index=False)

In [56]:
def training_kmeans(positive_cluster_index=1):
    '''Cluster the saved word vectors into two groups and export a sentiment dictionary.

    Loads ``word2vec.model``, fits a 2-cluster KMeans on its word vectors and
    gives every vocabulary word a ``sentiment_coeff``: the inverse of its
    distance to the nearest cluster centre, signed +1 when the word belongs to
    ``positive_cluster_index`` and -1 otherwise. The ``words`` and
    ``sentiment_coeff`` columns are written to ``sentiment_dictionary.csv``.

    Args:
        positive_cluster_index: which KMeans label (0 or 1) is treated as the
            positive cluster. Defaults to 1.
            NOTE(review): nothing in this function determines which label is
            actually the positive one; inspect the clusters and pass the
            right index if the default is wrong for your model.

    Raises:
        ValueError: if ``positive_cluster_index`` is not 0 or 1.

    Returns:
        None.
    '''
    if positive_cluster_index not in (0, 1):
        raise ValueError('positive_cluster_index must be 0 or 1')

    word_vectors = Word2Vec.load("word2vec.model").wv
    vectors = word_vectors.vectors.astype('double')

    # random_state=1 is the seed that the boolean ``True`` was interpreted as.
    model = KMeans(n_clusters=2, max_iter=1000, random_state=1, n_init=50).fit(X=vectors)

    # One batched call each instead of one predict/transform call per word.
    # Row i of ``vectors`` belongs to ``index_to_key[i]``.
    cluster_labels = model.predict(vectors)
    nearest_center_distance = model.transform(vectors).min(axis=1)

    words = pd.DataFrame({'words': word_vectors.index_to_key})
    words['cluster'] = cluster_labels
    words['cluster_value'] = np.where(words.cluster == positive_cluster_index, 1, -1)
    words['closeness_score'] = 1 / nearest_center_distance
    words['sentiment_coeff'] = words.closeness_score * words.cluster_value

    words[['words', 'sentiment_coeff']].to_csv('sentiment_dictionary.csv', index=False)

In [57]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    create dictionary for each input sentence x, where each word has assigned its tfidf score

    inspired  by function from this wonderful article:
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34

    x - row of dataframe, containing sentences, and their indexes; ``x.name``
        is used as the row position in transformed_file, so the dataframe
        index must match the row order of transformed_file
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer
        (a pandas Series, looked up by position)

    returns - dict mapping each word with a stored score in that row to the score
    '''
    vector_coo = transformed_file[x.name].tocoo()
    # Look the words up from the column indices without overwriting
    # ``vector_coo.col``: that attribute is the matrix's integer index array,
    # and replacing it with strings may be rejected by recent SciPy releases.
    words = features.iloc[vector_coo.col].values
    return dict(zip(words, vector_coo.data))

def replace_tfidf_words(x, transformed_file, features):
    '''
    turn a sentence into the list of tfidf scores of its words, in word order

    x - row of dataframe, containing sentences, and their indexes; the
        sentence is read from ``x.title`` and split on whitespace
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    raises KeyError when a word of the sentence has no entry in the row's
    tfidf dictionary
    '''
    scores_by_word = create_tfidf_dictionary(x, transformed_file, features)
    return [scores_by_word[f'{token}'] for token in x.title.split()]

In [58]:
def replace_sentiment_words(word, sentiment_dict):
    '''
    look up the sentiment score of a word

    word - key to look up
    sentiment_dict - mapping from word to sentiment score
    returns - the stored score, or 0 when the word is not in sentiment_dict
    '''
    return sentiment_dict.get(word, 0)

In [67]:
def get_predictions():
    '''Score every cleaned sentence with the sentiment dictionary and print the result.

    Reads ``cleaned_dataset.csv`` (``title``, ``rate``) and
    ``sentiment_dictionary.csv`` (``words``, ``sentiment_coeff``). For each
    sentence it builds two aligned lists, the per-word sentiment coefficient
    (0 for words missing from the dictionary) and the per-word tf-idf score,
    and takes their dot product as ``sentiment_rate``. A sentence is predicted
    positive (1) when that rate is greater than 0, otherwise 0. The true label
    is 1 when ``rate`` equals 1 and 0 otherwise.

    Prints one sample sentence that was predicted negative, a separator line
    and the full result frame.

    Returns:
        None.
    '''
    final_file = pd.read_csv('cleaned_dataset.csv')

    sentiment_map = pd.read_csv('sentiment_dictionary.csv')
    sentiment_dict = dict(zip(sentiment_map.words.values, sentiment_map.sentiment_coeff.values))

    file_weighting = final_file.copy()

    tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
    tfidf.fit(file_weighting.title)
    # ``get_feature_names()`` was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(tfidf, 'get_feature_names_out'):
        feature_names = tfidf.get_feature_names_out()
    else:
        feature_names = tfidf.get_feature_names()
    features = pd.Series(feature_names)
    transformed = tfidf.transform(file_weighting.title)

    # Row-wise apply: the slowest step of this function on larger datasets.
    replaced_tfidf_scores = file_weighting.apply(
        lambda x: replace_tfidf_words(x, transformed, features), axis=1
    )

    replaced_closeness_scores = file_weighting.title.apply(
        lambda x: [replace_sentiment_words(y, sentiment_dict) for y in x.split()]
    )

    replacement_df = pd.DataFrame(
        data=[replaced_closeness_scores, replaced_tfidf_scores, file_weighting.title, file_weighting.rate]
    ).T
    replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence', 'sentiment']
    replacement_df['sentiment_rate'] = replacement_df.apply(
        lambda x: np.array(x.loc['sentiment_coeff']) @ np.array(x.loc['tfidf_scores']), axis=1
    )
    replacement_df['prediction'] = (replacement_df.sentiment_rate > 0).astype('int8')
    replacement_df['sentiment'] = [1 if i == 1 else 0 for i in replacement_df.sentiment]

    predicted_classes = replacement_df.prediction
    y_test = replacement_df.sentiment

    # NOTE(review): the confusion matrix and score table are built here but
    # are neither displayed nor returned; show them with display()/print()
    # when they are needed.
    conf_matrix = pd.DataFrame(confusion_matrix(y_test, predicted_classes))

    test_scores = (
        accuracy_score(y_test, predicted_classes),
        precision_score(y_test, predicted_classes),
        recall_score(y_test, predicted_classes),
        f1_score(y_test, predicted_classes),
    )
    scores = pd.DataFrame(data=[test_scores])
    scores.columns = ['accuracy', 'precision', 'recall', 'f1']
    scores = scores.T
    scores.columns = ['scores']

    # Show one sentence that was predicted negative. Guard the fixed position
    # so a dataset with fewer negative predictions does not raise KeyError.
    negative_sentences = replacement_df[replacement_df['prediction'] == 0]['sentence'].reset_index(drop=True)
    sample_position = 59
    if len(negative_sentences) > sample_position:
        print(negative_sentences[sample_position])
    else:
        print(f'Fewer than {sample_position + 1} sentences were predicted negative; no sample to show.')
    print('-----------------------------------------------')
    print(replacement_df)

In [61]:
def prescriptive_analytics():
    '''Print GPT-2 continuations of a "solution" prompt for a few sample feedbacks.

    Loads the pretrained ``gpt2`` tokenizer and TensorFlow LM-head model, then
    for each hard-coded feedback sentence builds the prompt
    "What is the solution for <feedback>", generates with the model's default
    generation settings and prints the decoded text.

    Returns:
        None.
    '''
    # Pretrained GPT-2 tokenizer and model
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    gpt2_model = TFGPT2LMHeadModel.from_pretrained('gpt2')

    # Hard-coded sample feedback sentences (nothing is read from disk here)
    sample_feedbacks = (
        "I'm not able to pay my university fees",
        "Course outline is not well defined",
        "Teachers are very good",
        "Teachers are not collaborative",
        "teaching style is not good",
    )

    for sentence in sample_feedbacks:
        # Turn the feedback into a question and encode it as token ids
        question = "What is the solution for " + sentence
        encoded = gpt2_tokenizer.encode(question, return_tensors='tf')
        token_ids = tf.constant(encoded)

        # Generate a continuation and decode the first returned sequence
        generated = gpt2_model.generate(token_ids)
        print(gpt2_tokenizer.decode(generated[0].numpy(), skip_special_tokens=True))


In [68]:
# Pipeline stages in dependency order. The first three are commented out, so only
# get_predictions() runs here. It reads cleaned_dataset.csv and sentiment_dictionary.csv,
# which data_preprocessing() and training_kmeans() write, so both files must already exist
# in the working directory (uncomment the earlier stages to regenerate them).
# import_data()
# data_preprocessing()
# training_kmeans()
get_predictions()

the examination system of the university is very impressive the sitting plan of he students is made in such a way that no 2 students of same class give the exam in same room or even sometimes same blocks through this students also get to_know and visit different blocks of the university the checking is also fair
-----------------------------------------------
                                       sentiment_coeff  \
0    [-1.0319097960186612, 0, 1.036606611948171, 0,...   
1    [-1.035325828604643, -1.0220210938944394, 1.01...   
2    [-1.0391542186665983, 0, -1.027270156328658, -...   
3    [1.036643297408026, -1.041125775113667, 1.0110...   
4    [-1.0281609632960544, 1.044951808199921, 1.036...   
..                                                 ...   
119  [0, 1.044951808199921, 1.024796561358173, 0, 0...   
120  [1.0171321511527047, 0, 0, 1.044951808199921, ...   
121  [-1.0655187303207156, 0, 0, -1.039767932814708...   
122  [1.0150974417255805, 1.045943319235544, 0, -1....   




In [70]:
import panel as pn

def test_function(value1, value2):
    print("The value entered in the first textbox is: ", value1)
    return "Shaka laka boom boom"

# Widgets for the feedback form: one text area and one submit button each for
# student and teacher feedback, plus a static text element that submit_clicked1
# fills with the value returned by test_function.
text_input1 = pn.widgets.TextAreaInput(name="Enter Student's Feedback:", width=500, height=200)
text_input2 = pn.widgets.TextAreaInput(name="Enter Teacher's Feedback:", width=500, height=200)
submit_button1 = pn.widgets.Button(name="Submit Student Feedback", button_type="primary")
submit_button2 = pn.widgets.Button(name="Submit Teachers Feedback", button_type="primary")
feedback_display = pn.widgets.StaticText(value="", width=500, background='lightgrey')

@submit_button1.on_click
def submit_clicked1(event):
    '''Click handler for the student button.

    Passes the student text area's current value to test_function (with an
    empty second argument) and shows the returned string in feedback_display.
    '''
    feedback_display.value = test_function(text_input1.value, "")

@submit_button2.on_click
def submit_clicked2(event):
    '''Click handler for the teacher button.

    Passes the teacher text area's current value to test_function as the
    second argument (the first is empty); the returned string is discarded.
    '''
    test_function("", text_input2.value)

# Stack the widgets vertically: student input, its button, the feedback text, an
# empty spacer row, then the teacher input and its button. Calling .show() on the
# layout serves it; the recorded output of this cell shows a Bokeh server being
# launched on localhost.
pn.Column(
pn.Row(text_input1),
pn.Row(submit_button1),
pn.Row(feedback_display),
pn.Row(""),
pn.Row(text_input2),
pn.Row(submit_button2)
).show(view='popup')

INFO - 16:03:48: Starting Bokeh server version 2.4.2 (running on Tornado 6.1)
INFO - 16:03:48: User authentication hooks NOT provided (default user enabled)


Launching server at http://localhost:60697


<bokeh.server.server.Server at 0x1b2a9113670>

INFO - 16:03:49: 200 GET / (::1) 194.32ms
INFO - 16:03:49: 200 GET /static/extensions/panel/css/alerts.css (::1) 8.97ms
INFO - 16:03:49: 200 GET /static/extensions/panel/css/markdown.css (::1) 5.86ms
INFO - 16:03:49: 200 GET /static/extensions/panel/css/card.css (::1) 13.18ms
INFO - 16:03:49: 200 GET /static/extensions/panel/css/dataframe.css (::1) 15.96ms
INFO - 16:03:49: 200 GET /static/extensions/panel/css/debugger.css (::1) 20.95ms
INFO - 16:03:49: 200 GET /static/extensions/panel/css/json.css (::1) 23.94ms
INFO - 16:03:49: 200 GET /static/extensions/panel/css/loading.css (::1) 26.74ms
INFO - 16:03:49: 200 GET /static/extensions/panel/css/widgets.css (::1) 3.99ms
INFO - 16:03:49: 200 GET /static/js/bokeh-gl.min.js?v=863c26b3d7cbcf2a0dbf119589404b3ca66734754cd0af1d6e6ca17679ae711126917f171667194a6f04765eba06d9eb2d7d1f2ba7ef8fee420b9244557386f8 (::1) 7.98ms
INFO - 16:03:49: 200 GET /static/extensions/panel/panel.min.js?v=d4fabbf73758512562f5c3b6a3b77456c94ac424b7443f9aee6c61b7919cc2d