In [1]:
from sqlalchemy import create_engine
from db_config import *

# utilities
import re
import pickle
import numpy as np
import pandas as pd

# plotting
import seaborn as sns
import matplotlib.pyplot as plt

# nltk
from nltk.stem import WordNetLemmatizer

# sklearn
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import plot_roc_curve


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report


MNB = pickle.load(open('/Users/connormcdonald/Desktop/Masters/MIT807/Gartner Repository/Classification/Code/saved_models/Sentiment-MNB.pickle', 'rb'))
LR = pickle.load(open('/Users/connormcdonald/Desktop/Masters/MIT807/Gartner Repository/Classification/Code/saved_models/Sentiment-LR.pickle', 'rb'))
SVM = pickle.load(open('/Users/connormcdonald/Desktop/Masters/MIT807/Gartner Repository/Classification/Code/saved_models/Sentiment-SVC.pickle', 'rb'))
vectorisor = pickle.load(open('/Users/connormcdonald/Desktop/Masters/MIT807/Gartner Repository/Classification/Code/saved_models/vectoriser-ngram-(1,2).pickle', 'rb'))

In [2]:
# Defining dictionary containing all emojis with their meanings.
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}
          
## Defining set containing all stopwords in english.
stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from', 
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're',
             's', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']

In [3]:
def preprocess(textdata):
    processedText = []
    
    # Create Lemmatizer and Stemmer.
    wordLemm = WordNetLemmatizer()
    
    # Defining regex patterns.
    urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    userPattern       = '@[^\s]+'
    alphaPattern      = "[^a-zA-Z0-9]"
    sequencePattern   = r"(.)\1\1+"
    seqReplacePattern = r"\1\1"
    
    for tweet in textdata:
        tweet = tweet.lower()
        
        # Replace all URls with 'URL'
        tweet = re.sub(urlPattern,' URL',tweet)
        # Replace all emojis.
        for emoji in emojis.keys():
            tweet = tweet.replace(emoji, "EMOJI" + emojis[emoji])        
        # Replace @USERNAME to 'USER'.
        tweet = re.sub(userPattern,' USER', tweet)        
        # Replace all non alphabets.
        tweet = re.sub(alphaPattern, " ", tweet)
        # Replace 3 or more consecutive letters by 2 letter.
        tweet = re.sub(sequencePattern, seqReplacePattern, tweet)

        tweetwords = ''
        for word in tweet.split():
            # Checking if the word is a stopword.
            #if word not in stopwordlist:
            if len(word)>1:
                # Lemmatizing the word.
                word = wordLemm.lemmatize(word)
                tweetwords += (word+' ')
            
        processedText.append(tweetwords)
        
    return processedText

In [4]:
table = 'social.five_g_only'
classifier_name = 'MNB'
# start_date = '2021-01-01'
# end_date = '2021-12-31'


engine = create_engine('mysql+mysqlconnector://'+ user + ':' + passwd + '@' + ip + ':3306/' + schema1)
df = pd.read_sql(f'SELECT * FROM {table} ', engine)

In [5]:
import time
text = df['text']
t = time.time()
processedtext = preprocess(text)
print(f'Text Preprocessing complete.')
print(f'Time Taken: {round(time.time()-t)} seconds')

Text Preprocessing complete.
Time Taken: 271 seconds


In [6]:
X  = vectorisor.transform(processedtext)

In [7]:
MNB_pred = MNB.predict(X)
MNB_prob = MNB.predict_proba(X)

LR_pred = LR.predict(X)
LR_prob = LR.predict_proba(X)

SVM_pred = SVM.predict(X)
SVM_prob = SVM.predict_proba(X)

In [8]:
a = LR_pred
b = SVM_pred
c = MNB_pred

d = LR_prob
e = SVM_prob
f = MNB_prob

ensemble_pred = []
ensemble_prob = []

for i in range(len(a)):
    temp = []
    temp2 = []

    temp.append(a[i])
    temp.append(b[i])
    temp.append(c[i])
    ensemble_pred.append(max(set(temp), key=temp.count))

    temp2.append(max(d[i]))
    temp2.append(max(e[i]))
    temp2.append(max(f[i]))
    ensemble_prob.append(np.mean(temp2))

In [9]:
mnb_model_prob = []
for i in range(len(MNB_pred)):
    if MNB_pred[i] == 1:
        mnb_model_prob.append(max(MNB_prob[i]))
    else:
        mnb_model_prob.append(1 - max(MNB_prob[i]))


lr_model_prob = []
for i in range(len(LR_pred)):
    if LR_pred[i] == 1:
        lr_model_prob.append(max(LR_prob[i]))
    else:
        lr_model_prob.append(1 - max(LR_prob[i]))


svm_model_prob = []
for i in range(len(SVM_pred)):
    if SVM_pred[i] == 1:
        svm_model_prob.append(max(SVM_prob[i]))
    else:
        svm_model_prob.append(1 - max(SVM_prob[i]))


In [10]:
fiveg_sentiment = df.iloc[:,[0]]
fiveg_sentiment['mnb_prediction'] = MNB_pred
fiveg_sentiment['mnb_probability'] = np.array(mnb_model_prob)

fiveg_sentiment['lr_prediction'] = LR_pred
fiveg_sentiment['lr_probability'] = np.array(lr_model_prob)


fiveg_sentiment['svm_prediction'] = SVM_pred
fiveg_sentiment['svm_probability'] = np.array(svm_model_prob)


fiveg_sentiment['ensemble_prediction'] = np.array(ensemble_pred)
fiveg_sentiment['ensemble_probability'] = np.array(ensemble_prob)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fiveg_sentiment['mnb_prediction'] = MNB_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fiveg_sentiment['mnb_probability'] = np.array(mnb_model_prob)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fiveg_sentiment['lr_prediction'] = LR_pred
A value is trying to be set on a copy of a slice from a

In [11]:
fiveg_sentiment

Unnamed: 0,id,mnb_prediction,mnb_probability,lr_prediction,lr_probability,svm_prediction,svm_probability,ensemble_prediction,ensemble_probability
0,8429189120,0,0.389555,0,0.437693,1,0.530554,0,0.567769
1,8426974156,1,0.711811,1,0.597386,1,0.564564,1,0.624587
2,8425178074,0,0.433470,1,0.553662,1,0.583830,1,0.568007
3,8423781651,1,0.663098,1,0.688232,1,0.578114,1,0.643148
4,8422294950,1,0.708029,1,0.784254,1,0.578925,1,0.690403
...,...,...,...,...,...,...,...,...,...
4840192,1100449732676075520,1,0.621337,1,0.504261,1,0.563637,1,0.563078
4840193,1100449720823173121,1,0.743817,1,0.884691,1,0.552437,1,0.726982
4840194,1100449707745148928,1,0.559057,1,0.772396,1,0.575902,1,0.635785
4840195,1100449699671277569,0,0.420618,0,0.292846,1,0.583830,0,0.623455


In [12]:
fiveg_sentiment.to_sql('fiveG_sentiment' , con=engine, if_exists='append', index=False, chunksize=20000)