In [4]:
# VERY GOOD TUTORIAL: https://towardsdatascience.com/introduction-to-nlp-part-4-supervised-text-classification-model-in-python-96e9709b4267
import nltk
import csv
# Fetch the NLTK resources this notebook asks for (each call is a no-op when
# the package is already up to date).
for _nltk_package in ('stopwords', 'wordnet', 'punkt', 'movie_reviews'):
    nltk.download(_nltk_package)

from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\erice\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\erice\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\erice\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\erice\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [5]:
# Import dependencies and set pandas display options
import pandas as pd
import numpy as np

from nltk.corpus import movie_reviews, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression


# Keep printed frames compact: at most 10 rows, 2 columns, 10 characters wide.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 2
pd.options.display.width = 10


In [6]:
# Load the labelled comments, keeping only the text and label columns.
sample = pd.read_csv(
    'persuasionExamples.csv',
    encoding="latin1",
    engine='python',
    usecols=['body', 'containsPersuasion'],
)

# Binary-encode the label: the raw value '[1]' becomes 1, anything else
# (including missing values) becomes 0.
is_persuasive = sample['containsPersuasion'] == '[1]'
sample['containsPersuasion'] = np.where(is_persuasive, 1, 0)

# Class balance after encoding.
sample['containsPersuasion'].value_counts()

1    49240
0    47467
Name: containsPersuasion, dtype: int64

In [7]:
# Hold out 30% of the rows for validation and train on the remaining 70%.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    sample['body'],
    sample['containsPersuasion'],
    test_size=0.3,
    random_state=123,
)

print(f'Train dimensions: {X_train.shape, y_train.shape}')
print(f'Test dimensions: {X_test.shape, y_test.shape}')

# Per-class label counts, first for the training split and then for the test split.
for split_labels in (y_train, y_test):
    print(split_labels.value_counts())

Train dimensions: ((67694,), (67694,))
Test dimensions: ((29013,), (29013,))
1    34360
0    33334
Name: containsPersuasion, dtype: int64
1    14880
0    14133
Name: containsPersuasion, dtype: int64


In [8]:
# Text preprocessing used as the TF-IDF analyser
def preprocess_text(text):
    """Split ``text`` into word tokens and reduce each one to its Porter stem.

    Tokens are runs of ``\\w`` characters, so punctuation is discarded. Every
    token is lowercased before stemming. Note that this is stemming, not
    lemmatisation (e.g. 'believe' -> 'believ').

    Args:
        text: The string to tokenise.

    Returns:
        A list of stemmed, lowercased tokens in their original order.
    """
    stemmer = PorterStemmer()
    words = RegexpTokenizer(r'\w+').tokenize(text)
    return [stemmer.stem(word.lower()) for word in words]

In [9]:
# Sanity check: show the third comment next to its preprocessed tokens.
example_body = sample['body'].iloc[2]
print(example_body)
var = preprocess_text(example_body)
print(var)

> I believe that moving forward, we should push for much higher taxes on those earning/worth more than 1 billion USD.

How do you propose that this is done? You should note that billionaires do not earn billions, they own things that are worth billions.
['i', 'believ', 'that', 'move', 'forward', 'we', 'should', 'push', 'for', 'much', 'higher', 'tax', 'on', 'those', 'earn', 'worth', 'more', 'than', '1', 'billion', 'usd', 'how', 'do', 'you', 'propos', 'that', 'thi', 'is', 'done', 'you', 'should', 'note', 'that', 'billionair', 'do', 'not', 'earn', 'billion', 'they', 'own', 'thing', 'that', 'are', 'worth', 'billion']


In [10]:
# TF-IDF features, with preprocess_text doing the tokenising and stemming.
vectoriser = TfidfVectorizer(analyzer=preprocess_text)

# Cast every training document to a unicode string before vectorising, then
# learn the vocabulary and build the feature matrix in one step.
train_documents = X_train.values.astype('U')
X_train_tfidf = vectoriser.fit_transform(train_documents)

# (n_documents, n_features)
X_train_tfidf.shape

(67694, 41723)

In [11]:
# Modelling: 5-fold cross-validation on the training features.
# Despite its name, sgd_clf is a logistic regression, not an SGD classifier.
sgd_clf = LogisticRegression(solver='lbfgs', max_iter=120000)
sgf_clf_scores = cross_val_score(sgd_clf, X_train_tfidf, y_train, cv=5)
print(sgf_clf_scores)

# Report the mean fold accuracy and two standard deviations around it.
mean_accuracy = sgf_clf_scores.mean()
accuracy_spread = sgf_clf_scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_accuracy, accuracy_spread))

[0.89511781 0.8940099  0.8985154  0.89452692 0.89097356]
Accuracy: 0.89 (+/- 0.00)


In [12]:
# Chain the vectoriser and the classifier so raw text can be passed straight in.
pipe = Pipeline(steps=[
    ('vectoriser', vectoriser),
    ('classifier', sgd_clf),
])

# NOTE: the labels are cast to unicode strings here, so the fitted pipeline
# predicts the strings '0' / '1' rather than integers. Any code comparing
# predictions with labels has to cast the labels the same way.
train_documents = X_train.values.astype('U')
train_labels = y_train.values.astype('U')
pipe.fit(train_documents, train_labels)

Pipeline(steps=[('vectoriser',
                 TfidfVectorizer(analyzer=<function preprocess_text at 0x000002A8F49E0D30>)),
                ('classifier', LogisticRegression(max_iter=120000))])

In [13]:
# Evaluate on the held-out split. The true labels are cast to unicode strings
# so they are comparable with the predictions.
y_test_pred = pipe.predict(X_test.values.astype('U'))
y_test_as_str = y_test.values.astype('U')
print("Accuracy: %0.2f" % (accuracy_score(y_test_as_str, y_test_pred)))
print(confusion_matrix(y_test_as_str, y_test_pred))

Accuracy: 0.90
[[12904  1229]
 [ 1683 13197]]


In [18]:
# Build a small hand-written set of unlabelled sentences and save it as a
# one-column CSV ('body') for the trained pipeline to score.

# rows of csv (not used in this cell)
comment_bodies = []

# columns of csv
details = ['body']

myTests = ["this is a statement.", "you should not go to college", "those speakers are loud",
           "Since i like vanilla, you should get vanilla ice cream instead of chocolate", "kpop is short for korean pop",
           "please dont listen to kpop because it is bad", "you should come to my party this weekend", 
           "Coke was invented in atlanta georiga by John Stith Pemberton", 
           "If you are from atlanta, it is important to drink coke and not pepsi",
           "the bears are a really good nfl team", "if youre coming to my party, you better be wearing some bears merch",
          "please dont listen to kpop because it is bad. please dont listen to kpop because it is bad. please dont listen to kpop because it is bad. please dont listen to kpop because it is bad. please dont listen to kpop because it is bad. please dont listen to kpop because it is bad"]

# Write the header followed by one row per sentence.
# - mode 'w' (not 'a'): re-running the cell must not append a second header
#   and a duplicate copy of every row to an existing file.
# - newline='': required by the csv module, otherwise extra blank lines can
#   appear between rows on Windows.
# - encoding='latin1': the same encoding the file is read back with.
with open('testSet.csv', 'w', newline='', encoding='latin1') as f:
    write = csv.writer(f)
    write.writerow(details)
    for test in myTests:
        try:
            write.writerow([test])
        except UnicodeEncodeError as err:
            # Best effort: skip text that latin1 cannot represent (e.g. emojis)
            # but say what was skipped instead of hiding every possible error.
            print(f"Skipped a row that cannot be encoded as latin1: {err}")
print("Done!")

Done!


In [19]:
# Read the hand-written sentences back in; only the 'body' column is used.
man_test = pd.read_csv(
    'testSet.csv',
    encoding="latin1",
    engine='python',
    usecols=['body'],
)
man_X_test = man_test['body']
print(man_X_test)

0                                  this is a statement.
1                          you should not go to college
2                               those speakers are loud
3     Since i like vanilla, you should get vanilla i...
4                          kpop is short for korean pop
                            ...                        
7     Coke was invented in atlanta georiga by John S...
8     If you are from atlanta, it is important to dr...
9                  the bears are a really good nfl team
10    if youre coming to my party, you better be wea...
11    please dont listen to kpop because it is bad. ...
Name: body, Length: 12, dtype: object


In [20]:
# Score the hand-written sentences and print each one next to its prediction.
predictions = pipe.predict(man_X_test.values.astype('U'))

# Pair each sentence with its label directly instead of keeping a manual
# counter. The f-string formats the label whatever its type, whereas
# `str + label` only works while the pipeline predicts string labels.
for body, label in zip(man_X_test, predictions):
    print(f"{body}: {label}")

this is a statement.: 0
you should not go to college: 1
those speakers are loud: 0
Since i like vanilla, you should get vanilla ice cream instead of chocolate: 0
kpop is short for korean pop: 0
you should come to my party this weekend: 0
Coke was invented in atlanta georiga by John Stith Pemberton: 0
If you are from atlanta, it is important to drink coke and not pepsi: 1
the bears are a really good nfl team: 0
if youre coming to my party, you better be wearing some bears merch: 0
