In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn import metrics

import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import itertools

import tensorflow_datasets as tfds
import tensorflow_hub as hub
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses

from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from datasets import Dataset, DatasetDict, ClassLabel, Value, Features
from transformers import AutoTokenizer
from transformers import TFAutoModel
from transformers import TFAutoModelForSequenceClassification

from sklearn.preprocessing import StandardScaler

from torch.nn.functional import cross_entropy
import umap

In [None]:
# Build the paths with pathlib so the notebook also runs on non-Windows machines;
# the original backslash-separated strings only resolve on Windows.
DATA_DIR = Path("Data")
tdf = pd.read_csv(DATA_DIR / "twitter_training.csv")
vdf = pd.read_csv(DATA_DIR / "twitter_validation.csv")

In [None]:
# Gather the raw values used for the frequency counts in the following cells.
# The original loop iterated over `ntdf`, which is not defined anywhere in the
# visible notebook; the training frame loaded above is `tdf`.
TopicList = tdf["Topic"].tolist()
Sentimentlist = tdf["Sentiment"].tolist()

# Join comments with a space so the last word of one comment does not fuse with
# the first word of the next when the string is later split on " ".
commentstring = " ".join(str(comment) for comment in tdf["Comment"])
    

In [None]:
# Frequency tables for topics, sentiments and whitespace-separated comment tokens.
cTopiclist, cSentimentlist = (
    Counter(values) for values in (TopicList, Sentimentlist)
)
commentwordlist = commentstring.split(" ")
ccommentwordlist = Counter(commentwordlist)

In [None]:
# Show the most frequent topics (10), sentiments (4) and comment tokens (10),
# followed by the number of distinct comment tokens.
for tally, top_n in ((cTopiclist, 10), (cSentimentlist, 4), (ccommentwordlist, 10)):
    print(tally.most_common(top_n))
print(len(ccommentwordlist))

In [None]:
def Cleaning(x):
    """Lowercase, tokenize and lemmatize a single comment.

    ``x`` is coerced with ``str`` first, so any value is accepted. Tokens are
    the regex matches of two or more word characters; each token is passed
    through ``WordNetLemmatizer.lemmatize`` and the results are joined with
    single spaces.
    """
    text = str(x).lower()
    wordnet = WordNetLemmatizer()
    lemmas = (
        wordnet.lemmatize(token)
        for token in nltk.regexp_tokenize(text, r'(\b[\w]{2,}\b)')
    )
    return " ".join(lemmas)

def RSW(x):
    """Remove English stop words from a single comment.

    ``x`` is coerced with ``str``, lowercased and tokenized into runs of two or
    more word characters. Tokens found in NLTK's English stop-word list are
    discarded; the remaining tokens are joined with single spaces.
    """
    english_stop_words = set(stopwords.words('english'))
    kept = []
    for token in nltk.regexp_tokenize(str(x).lower(), r'(\b[\w]{2,}\b)'):
        if token.lower() in english_stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

# Derive the three preprocessed variants of the raw "Comment" column.
# Series.apply replaces the original iterrows() loops, which assigned one cell
# at a time through .loc; the resulting column values are the same.

# Creating Lem With Stop
tdf["clean_Comment"] = tdf["Comment"].apply(Cleaning)

# Creating No Lem With out Stop
tdf["No_Stop_Words_Comment"] = tdf["Comment"].apply(RSW)

# Creating Lem With Out Stop
# RSW(Cleaning(comment)) — reuse the lemmatized column computed above rather
# than lemmatizing every comment a second time.
tdf["No_Stop_Words_Lemmatized_Comment"] = tdf["clean_Comment"].apply(RSW)
    

In [None]:
# Hold out 20% of the rows for testing. The feature side keeps the whole frame
# (every text variant), the target is the "Sentiment" column; the seed is fixed
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    tdf,
    tdf["Sentiment"],
    test_size=0.20,
    random_state=42,
)

In [None]:
def TheOne(typeofabstract, binary, ngram):
    """Cross-validate and test a bag-of-words logistic regression on one text column.

    Reads the notebook-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``.

    Parameters
    ----------
    typeofabstract : str
        Name of the text column in ``X_train`` / ``X_test`` to vectorize.
    binary : bool
        Forwarded to ``CountVectorizer(binary=...)``.
    ngram : int
        Upper bound of the n-gram range, i.e. ``ngram_range=(1, ngram)``.

    Returns
    -------
    tuple
        ``(test_f1, scores, stufflist)``: the macro F1 on the held-out split,
        the dict returned by ``cross_validate`` (10 folds, ``f1_macro``), and
        ``[typeofabstract, "Binary" | "Non-Binary", "1_ngram" | "2_ngram"]``.
    """
    # The split is taken before the notebook casts its text columns to str, so a
    # column may still hold non-string values (e.g. NaN); coerce here so the
    # vectorizer always receives strings.
    train_text = X_train[typeofabstract].astype(str)
    test_text = X_test[typeofabstract].astype(str)

    pipeline = Pipeline([
        ("Vec", CountVectorizer(binary=binary, ngram_range=(1, ngram))),
        ("Clf", LogisticRegression(max_iter=3000)),
    ])
    scores = cross_validate(
        pipeline,
        train_text,
        y_train,
        cv=10,
        scoring=['f1_macro'],
    )

    pipeline.fit(train_text, y_train)
    y_pred = pipeline.predict(test_text)
    # Score against the held-out sentiment labels. The original compared the
    # predictions with the raw comment text, which is not a label vector.
    y_true = y_test

    stufflist = [
        typeofabstract,
        "Binary" if binary else "Non-Binary",
        "1_ngram" if ngram == 1 else "2_ngram",
    ]
    return metrics.f1_score(y_true, y_pred, average='macro'), scores, stufflist

In [None]:
# Drop rows whose "Comment" is a float rather than text.
# The original test `row["Comment"] == float` compared the value with the type
# object and was never true, and `tdf.drop(row)` neither matched index labels
# nor kept its result, so no row was ever removed.
# NOTE: X_train / X_test were split from `tdf` in an earlier cell and are not
# changed by this filter.
is_float_comment = tdf["Comment"].map(lambda value: isinstance(value, float))
tdf = tdf.loc[~is_float_comment].copy()

In [None]:
# Cast the raw comment column and its three derived variants to str.
for text_column in (
    'Comment',
    'clean_Comment',
    'No_Stop_Words_Comment',
    'No_Stop_Words_Lemmatized_Comment',
):
    tdf[text_column] = tdf[text_column].astype(str)

In [None]:
# Run every combination of text column x binary/count features x n-gram range.
# The order matches the original hand-written sequence: all four columns for
# (binary, 1-gram), then (count, 1-gram), (binary, 2-gram), (count, 2-gram).
TEXT_COLUMNS = [
    "clean_Comment",
    "No_Stop_Words_Comment",
    "No_Stop_Words_Lemmatized_Comment",
    "Comment",
]
scenarios = [
    (column, binary, ngram)
    for ngram in (1, 2)
    for binary in (True, False)
    for column in TEXT_COLUMNS
]

test_f1_list = []   # held-out macro F1 per scenario
cv_f1_list = []     # per-fold cross-validation F1 arrays
cv_f1t_list = []    # per-fold fit times
event_list = []     # [column, "Binary"/"Non-Binary", "1_ngram"/"2_ngram"] per scenario

for run_number, (column, binary, ngram) in enumerate(scenarios, start=1):
    a, b, c = TheOne(column, binary, ngram)
    test_f1_list.append(a)
    cv_f1_list.append(b["test_f1_macro"])
    event_list.append(c)
    cv_f1t_list.append(b["fit_time"])
    # Progress marker after each run; as in the original, nothing is printed
    # after the final scenario.
    if run_number < len(scenarios):
        print(str(run_number))

In [None]:
# Build the model card: one row per scenario, numbered from 1 in run order.
#
# Maps the text column a scenario was trained on to
# (Lemmatized label, stop_words label, phrase used in the scenario name).
# The original branches compared against "abstract"-style column names that
# never occur in event_list, so every scenario fell through to the final
# "Is_Lemmatized / Not_Included" branch. An unknown column now raises KeyError
# instead of being silently mislabelled.
PREPROCESSING_LABELS = {
    "Comment": (
        "Not_Lemmatized", "Included", "Not_Lemmatized Stop Words Included"),
    "clean_Comment": (
        "Is_Lemmatized", "Included", "Is_Lemmatized Stop Words Included"),
    "No_Stop_Words_Comment": (
        "Not_Lemmatized", "Not_Included", "Not_Lemmatized Stop Words Not Included"),
    "No_Stop_Words_Lemmatized_Comment": (
        "Is_Lemmatized", "Not_Included", "Is_Lemmatized Stop Words Not Included"),
}

ndf = {}   # row number -> dict of model-card fields
tbl = []   # scenario names in run order (used as tick labels by the plot cell)

# One row is produced per recorded scenario (the original pre-allocated exactly
# 16 rows; with the 16 scenarios run above the table is the same size).
for i, (event, test_f1, cv_f1, fit_times) in enumerate(
    zip(event_list, test_f1_list, cv_f1_list, cv_f1t_list), start=1
):
    column, binary_label, ngram_label = event
    lemmatized, stop_words, preprocessing_phrase = PREPROCESSING_LABELS[column]
    is_binary = binary_label == "Binary"
    is_unigram = ngram_label == "1_ngram"

    scenario = " ".join([
        preprocessing_phrase,
        "Is Binarized" if is_binary else "Isn't Binarized",
        "Is 1ngram" if is_unigram else "Is 2ngram",
    ])
    tbl.append(scenario)

    ndf[i] = {
        "Lemmatized": lemmatized,
        "stop_words": stop_words,
        "binarized": "Yes" if is_binary else "No",
        "n_gram": "1" if is_unigram else "2",
        "mean_f1": np.mean(cv_f1),
        "median_f1": np.median(cv_f1),
        "std_f1": np.std(cv_f1),
        "total_fit_time": sum(fit_times),  # summed over the CV folds
        "test_f1": test_f1,
        "scenario": scenario,
    }

# Rows are keyed by scenario number, so transpose to get one row per scenario.
nndf = pd.DataFrame(ndf).T
display(nndf)
nndf.to_csv('model_card.csv')

In [None]:
# One box per scenario showing the spread of its cross-validation F1 scores.
data = list(cv_f1_list)

fig, ax = plt.subplots(figsize=(10, 7))
ax.boxplot(data)
ax.set_xticklabels(tbl, rotation=90)  # scenario names, in run order

ax.set_xlabel("Scenarios")
ax.set_ylabel("F1 Score")
ax.set_title("F1 Scores by scenario")
plt.show()

In [None]:
import dill

In [None]:
def Cleaning(x):
    """Lowercase, tokenize and lemmatize every document in an iterable.

    NOTE: this redefines the ``Cleaning`` from an earlier cell with a different
    contract: it takes an iterable of documents and returns a list, which is the
    shape ``FunctionTransformer`` needs inside the pipeline below.

    Parameters
    ----------
    x : iterable
        Documents to clean. Each element is coerced with ``str`` (as the
        earlier single-document ``Cleaning`` does), so non-string entries such
        as NaN no longer raise ``AttributeError`` on ``.lower()``.

    Returns
    -------
    list of str
        One cleaned string per input document: tokens of two or more word
        characters, lemmatized and joined with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    noc = []
    for document in x:
        tokens = nltk.regexp_tokenize(str(document).lower(), r'(\b[\w]{2,}\b)')
        noc.append(" ".join(lemmatizer.lemmatize(token) for token in tokens))
    return noc

# Final model: clean -> 1- and 2-gram counts -> logistic regression, trained on
# the raw comments so the saved pipeline accepts raw strings at predict time.
pipeline = Pipeline([
    ("Func", FunctionTransformer(Cleaning)),
    ("Vec", CountVectorizer(ngram_range=(1, 2))),
    ("Clf", LogisticRegression()),
])
pipeline.fit(X_train['Comment'], y_train)

dill.settings['recurse'] = True
# Use a context manager so the file handle is closed once the dump finishes.
with open('model.pkl', 'wb') as model_file:
    dill.dump(pipeline, model_file)

In [None]:
# Smoke-test the saved pipeline on a few raw strings.
test_strings = ["The Department of Veterans Affairs (VA) proposes to revise its regulations to", 'health health health health', "dog cat lamb"]

# SECURITY NOTE: dill.load (like pickle.load) can execute arbitrary code from
# the file; only load model files you created or otherwise trust.
# The context manager closes the file handle after loading.
with open('model.pkl', 'rb') as model_file:
    model_saved = dill.load(model_file)

y_pred = model_saved.predict(test_strings)
y_pred