In [14]:
import gensim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from numpy import random
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report

from myStopwords import stopword
from clean_file import *

%matplotlib inline

In [18]:
import re
import nltk
import emoji
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer()

def remove_punctuations(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

def remove_digits(text):
    text = ''.join([digit for digit in text if not digit.isdigit()])
    return text

def remove_emojis(text):
    text = re.sub(emoji.get_emoji_regexp(), r"", text)
    return text

def remove_stopwords(text):
    text = ' '.join(word for word in text.split() if word not in stopword)
    return text

def clean_text(text):
    text = text.replace("&#39;", "'")
    text = text.replace("&quot;", "")
    text = text.replace("&amp;", "")
    text = text.replace("‘", "'")
    text = text.replace("’", "'")
    text = text.replace("“", "'")
    text = text.replace("”", "'")
    text = text.replace(" – ", " ")
    text = text.replace("—", " ")
    text = text.replace("•", " ")
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'bit.ly/\S+', '', text)
    text = re.sub('(RT\s@[A-Za-z0-9-_]+[A-Za-z0-9-_]: +)', '', text)
    text = re.sub('(@[A-Za-z0-9-_]+[A-Za-z0-9-_]+)', '', text)
    text = BeautifulSoup(text, "lxml").text 
    text = text.lower()
    text = remove_punctuations(text)
    text = remove_digits(text)
    text = remove_emojis(text)
    text = remove_stopwords(text)
    text = lemmatize_text(text)
    return text

def nltk2wn_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None
    
def lemmatize_text(text):
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(text))  
    wn_tagged = map(lambda x: (x[0], nltk2wn_tag(x[1])), nltk_tagged)
    res_words = []
    for word, tag in wn_tagged:
        if tag is None:            
            res_words.append(word)
        else:
            res_words.append(lemmatizer.lemmatize(word, tag))
    return " ".join(res_words)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lilin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lilin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lilin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
df = pd.read_excel('training_1.xlsx')
df = df[['text', 'label']]
df.columns = ['text', 'label']
df.head(5)

In [None]:
dff = df.loc[df['label'] != 0]
dff['clean_text'] = dff['text'].apply(clean_text)
dff = dff.dropna(subset= ['clean_text'])
dff.head(5)

In [15]:
labels = ['1', '-1']

In [None]:
plt.figure(figsize=(10,4))
dff['label'].value_counts().plot(kind='bar')

In [16]:
df1 = pd.read_excel('training_2.xlsx')
print(len(df1))
df1.head(5)

19504


Unnamed: 0,text,label
0,reopen res publica,1
1,reopen country,1
2,country,1
3,country reopen,1
4,reopen nation country,1


In [5]:
def filter_text(text):
    return None if len(text.split()) < 2 else text

print('volume before filter', len(df1))
#df1['text'] = df1['text'].apply(filter_text)
#df1 = df1.dropna(subset=['text'])
#print('volume after filter', len(df1))

volume before filter 19504


In [19]:
df2 = pd.read_excel('testing_3.xlsx')
df2['clean_text'] = df2['text'].apply(clean_text)
df2.head(5)

Unnamed: 0,text,label,clean_text
0,RT @TheDemocrats: We can’t know when it’s safe...,-1,cant know safe reopen widespread testing
1,RT @CBSNews: Dr. Fauci on guidelines to reopen...,-1,dr fauci guideline reopen us economy program n...
2,RT @thehill: Seattle Mayor says she does not b...,-1,seattle mayor say not believe city state ameri...
3,RT @JohnCornyn: Opinion: We can gradually reop...,1,opinion gradually reopen texan part
4,RT @dougducey: Thank you @POTUS &amp; @SecBern...,1,thank work reopen americas national park az pl...


In [20]:
X_train = df1['text']
y_train = df1['label']
X_test = df2['clean_text']
y_test = df2['label']

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### 1. Multinomial Bayes

In [24]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

y_trad = nb.predict(X_train)
y_pred = nb.predict(X_test)

print("train accuracy: %.2f%%" % (accuracy_score(y_trad, y_train)*100))
print("test accuracy: %.2f%%" % (accuracy_score(y_pred, y_test)*100))
print(classification_report(y_test, y_pred,target_names=labels))

train accuracy: 93.88%
test accuracy: 72.81%
              precision    recall  f1-score   support

           1       0.81      0.73      0.77      1013
          -1       0.63      0.73      0.68       642

    accuracy                           0.73      1655
   macro avg       0.72      0.73      0.72      1655
weighted avg       0.74      0.73      0.73      1655



### 2. Linear Support Vector with SDG

In [22]:
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='log', penalty='l2',alpha=1e-4, random_state=45, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)

y_trad = sgd.predict(X_train)
y_pred = sgd.predict(X_test)

print("train accuracy: %.2f%%" % (accuracy_score(y_trad, y_train)*100))
print("test accuracy: %.2f%%" % (accuracy_score(y_pred, y_test)*100))
print(classification_report(y_test, y_pred,target_names=labels))

train accuracy: 93.37%
test accuracy: 72.51%
              precision    recall  f1-score   support

           1       0.81      0.72      0.76      1013
          -1       0.62      0.73      0.67       642

    accuracy                           0.73      1655
   macro avg       0.72      0.73      0.72      1655
weighted avg       0.74      0.73      0.73      1655



### 3. Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression

lr = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(solver='lbfgs', C=1e5, max_iter=3000)),
               ])
lr.fit(X_train, y_train)

y_trad = lr.predict(X_train)
y_pred = lr.predict(X_test)

print("train accuracy: %.2f%%" % (accuracy_score(y_trad, y_train)*100))
print("test accuracy: %.2f%%" % (accuracy_score(y_pred, y_test)*100))
print(classification_report(y_test, y_pred,target_names=labels))

train accuracy: 99.97%
test accuracy: 67.85%
              precision    recall  f1-score   support

           1       0.77      0.68      0.72      1013
          -1       0.57      0.67      0.62       642

    accuracy                           0.68      1655
   macro avg       0.67      0.68      0.67      1655
weighted avg       0.69      0.68      0.68      1655



In [11]:
import os
import glob

def label_data(text, label):
    return nb.predict([text])[0] if pd.isnull(label) else label

def label_datafile(filepath, dfs):
    dft = pd.read_excel(filepath)
    dft = clean_file(dft, dfs)
    dft = dft.loc[dft['lang']=='en']
    df = pd.merge(dft, dfs, on='text', how="left")
    df['clean_text'] = df.apply(lambda row: clean_text(str(row['text'])), axis = 1)
    df['label'] = df.apply(lambda row: label_data(str(row['clean_text']), row['label']), axis = 1)
    df = df[['user', 'verified', 'location', 'state', 'followers', 'time', 'text', 'lang', 'label', 'sentiment',
             'rt_user', 'rt_verified', 'rt_location', 'rt_followers', 'rt_time', 'rt_lang']]
    filename = str(filepath).split('\\')[-1].split('_sentiment')[0] + '_label.xlsx'
    write_excel(df, filename, save_directory)
    print('-------- finish writing file:', filename, '--------')
        
def write_excel(df, filename, save_directory):
    directory = save_directory + '/'
    os.makedirs(os.path.dirname(directory), exist_ok=True)
    output_file = os.path.join(directory, filename)      
    writer = pd.ExcelWriter(output_file, engine='xlsxwriter', options={'strings_to_urls': False})
    df.to_excel(writer, index=False)
    writer.save()
    
def read_folder(read_directory):
    files = glob.glob(read_directory + "/*.xlsx")
    for file in files:
        label_datafile(file, dfs)

In [12]:
dfs = pd.read_excel('samples.xlsx')
dfs.head(5)

Unnamed: 0,text,label
0,RT @lydiathegreat_: So Texas has closed school...,-1
1,RT @realDonaldTrump: REOPEN OUR COUNTRY!,1
2,RT @keithedwards: America getting ready to reo...,-1
3,RT @miaxmon: new zealand were fast to act and ...,-1
4,RT @realDonaldTrump: Many States moving to SAF...,1


In [13]:
read_directory = "reopen_clean"
save_directory = "reopen_final"
read_folder(read_directory)

----------start to clean file-----------
the number of tweet before filtering: 132205
the number of tweet after removing: 118496
the number of tweet after cleaning: 110071
the number of tweet after filtering: 95853
-------- finish writing file: reopen_2020-04-17_label.xlsx --------
----------start to clean file-----------
the number of tweet before filtering: 427941
the number of tweet after removing: 380576
the number of tweet after cleaning: 353238


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


the number of tweet after filtering: 305343
-------- finish writing file: reopen_2020-04-18_to_2020-04-24_label.xlsx --------
----------start to clean file-----------
the number of tweet before filtering: 165441
the number of tweet after removing: 150177
the number of tweet after cleaning: 136342


  ' Beautiful Soup.' % markup)


the number of tweet after filtering: 110578
-------- finish writing file: reopen_2020-04-25_to_2020-04-28_label.xlsx --------
----------start to clean file-----------
the number of tweet before filtering: 235624
the number of tweet after removing: 221072
the number of tweet after cleaning: 192970


  ' Beautiful Soup.' % markup)


the number of tweet after filtering: 156782
-------- finish writing file: reopen_2020-04-29_to_2020-05-03_label.xlsx --------
----------start to clean file-----------
the number of tweet before filtering: 51319
the number of tweet after removing: 43357
the number of tweet after cleaning: 39282
the number of tweet after filtering: 30578
-------- finish writing file: reopen_2020-05-04_label.xlsx --------
----------start to clean file-----------
the number of tweet before filtering: 58847
the number of tweet after removing: 50712
the number of tweet after cleaning: 46333
the number of tweet after filtering: 36218
-------- finish writing file: reopen_2020-05-05_label.xlsx --------
----------start to clean file-----------
the number of tweet before filtering: 66359
the number of tweet after removing: 58638
the number of tweet after cleaning: 51959
the number of tweet after filtering: 40717
-------- finish writing file: reopen_2020-05-06_label.xlsx --------
----------start to clean file-----

  ' Beautiful Soup.' % markup)


the number of tweet after filtering: 33472
-------- finish writing file: reopen_2020-05-08_label.xlsx --------
----------start to clean file-----------
the number of tweet before filtering: 87290
the number of tweet after removing: 78977
the number of tweet after cleaning: 73292
the number of tweet after filtering: 62286
-------- finish writing file: reopen_2020-05-09_to_2020-05-10_label.xlsx --------
----------start to clean file-----------
the number of tweet before filtering: 147224
the number of tweet after removing: 133040
the number of tweet after cleaning: 117854
the number of tweet after filtering: 92344
-------- finish writing file: reopen_2020-05-11_to_2020-05-12_label.xlsx --------
----------start to clean file-----------
the number of tweet before filtering: 158116
the number of tweet after removing: 132077
the number of tweet after cleaning: 118770
the number of tweet after filtering: 94823
-------- finish writing file: reopen_2020-05-13_to_2020-05-14_label.xlsx --------
-

  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


the number of tweet after filtering: 104124
-------- finish writing file: reopen_2020-05-17_to_2020-05-18_label.xlsx --------
----------start to clean file-----------
the number of tweet before filtering: 122494
the number of tweet after removing: 108242
the number of tweet after cleaning: 100434


  ' Beautiful Soup.' % markup)


the number of tweet after filtering: 80999
-------- finish writing file: reopen_2020-05-19_to_2020-05-20_label.xlsx --------
----------start to clean file-----------
the number of tweet before filtering: 162253
the number of tweet after removing: 139496
the number of tweet after cleaning: 122591


  ' Beautiful Soup.' % markup)


the number of tweet after filtering: 96835
-------- finish writing file: reopen_2020-05-21_to_2020-05-23_label.xlsx --------
----------start to clean file-----------
the number of tweet before filtering: 97704
the number of tweet after removing: 88289
the number of tweet after cleaning: 76751
the number of tweet after filtering: 59325
-------- finish writing file: reopen_2020-05-24_to_2020-05-26_label.xlsx --------
----------start to clean file-----------
the number of tweet before filtering: 118936
the number of tweet after removing: 105129
the number of tweet after cleaning: 95795
the number of tweet after filtering: 66558
-------- finish writing file: reopen_2020-05-27_to_2020-05-30_label.xlsx --------
