### Loading/Splitting Data

In [39]:
import csv
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline
import pandas as pd


X_txt_train= []
y_train = []

# Loading data from CSVs.

# 1. Load the training datasets into two lists (X_txt_train will be a list of strings; y_train)
file_train = open("train.tsv",  encoding="utf8")

reader_train = csv.reader(file_train,delimiter="\t")

for row in reader_train:
    y = row[2]
    X_txt = row[1]
    X_txt_train.append(X_txt)
    y_train.append(y)
    
# 2. Load the test datasets into two lists (X_txt_test will be a list of strings; y_test)    
file_test = open("test.tsv", encoding="utf8")
reader_test = csv.reader(file_test,delimiter="\t", quoting=csv.QUOTE_NONE)

test_data = []

for row in reader_test:
    test_data.append(row)  

X_txt_train, X_txt_test, y_train, y_test = train_test_split(X_txt_train, y_train, test_size=0.2, random_state=42)

### Feature Engineer

In [40]:
class LexiconClassifier():
    def __init__(self):
            """
                Initalize the Lexicon classifer by loading lexicons.
            """
            self.offensive_language = set()
            with open('bad-words.txt', encoding = 'utf-8') as iFile:
                for row in iFile:
                    self.offensive_language.add(row.strip())
            self.target = set()
            with open('names.txt', encoding='utf-8') as iFile:
                for row in iFile: # add if statement here to break up the Classifiers???
                    self.target.add(row.lower().strip())
                    

    def predict(self, tweet):
            """
                Returns a prediction for a string in tweet.
                Returns:
                pred -- a string ("TIN", "OFF", "NOT")  # this is for the classifier portion
            """
           
                    
            num_offensive_words = 0 # OFF
            num_targeted_offensive_words = 0  # TIN
            cap_words = 0
            
            for word in tweet.split():
                if word.lower() in self.offensive_language:
                    num_offensive_words += 1
                if word.lower() in self.target :
                    num_targeted_offensive_words +=1
                if word.isupper()== True and word not in ["@USER", "NOT","TIN","OFF","I"]:
                    cap_words +=1
            pred = "NOT"        
            #If tweet contains offensive words and target words, label as 'TIN'
            if num_offensive_words >0 and num_targeted_offensive_words >0:
                pred = "TIN"
            #If there's an offensive word, label it offensive
            elif num_offensive_words > 0:
                pred = 'OFF'
            return pred
        
    def count_offensive(self, tweet):
        
            num_offensive_words = 0
            for word in tweet.split():
                if word.lower() in self.offensive_language:
                    num_offensive_words += 1
                    
            return num_offensive_words

    def count_target(self, tweet):
        
            num_targeted_offensive_words = 0
            for word in tweet.split():
                if word.lower() in self.target:
                    num_targeted_offensive_words += 1
                    
            return num_targeted_offensive_words

    def count_cap(self, tweet):
        
            num_cap_words = 0
            for word in tweet.split():
                if word.isupper()== True and self not in ["@USER", "NOT","TIN","OFF","I"]:
                    num_cap_words +=1
            return num_cap_words
                    


### Score with rule based Features

In [41]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# 1. Instatiate that class
myclfr = LexiconClassifier()
lex_test_preds = []

# Loop over X_txt_test
#    for each string in X_txt_test (i.e., for each item in the list), pass it to LexiconClassifiers .predict() method
#    append the prediction to lex_test_preds
for i in X_txt_test:
    lex_test_preds.append(myclfr.predict(i))


precision = precision_score(lex_test_preds, y_test, average = 'micro') # Get scores using lex_test_preds and y_test with the precision_score method
recall = recall_score(lex_test_preds, y_test, average = 'micro') # Get scores using lex_test_preds and y_test with the recall_score method
f1 = f1_score(lex_test_preds, y_test, average = 'micro')# Get scores using lex_test_preds and y_test with the f1_score method

print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1: {:.4f}".format(f1))

Precision: 0.6588
Recall: 0.6588
F1: 0.6588


### Score with CountVectorizer features

In [4]:
pipe = Pipeline([('vec', CountVectorizer()),
                ('clf', LinearSVC(random_state = 42))])


params = {"clf__C": [0.01, 0.1, 1.],
         "vec__ngram_range": [(1,1),(1,2)]}

clf = GridSearchCV(pipe,params, cv=2, scoring = 'f1_micro')

clf.fit(X_txt_train, y_train)

Score = clf.predict(X_txt_test)

print('Best Score', clf.best_score_)
print('Best Params:', clf.best_params_)



Best Score 0.7220587586358138
Best Params: {'clf__C': 0.1, 'vec__ngram_range': (1, 1)}




### Score with both(stacked) rule based features and CountVectorizer 

In [42]:
X_train_lexicon_features = [] # Initailze to an empty list. This will be a list of lists
X_test_lexicon_features = [] #  Initailze to an empty list. This will be a list of lists

for i in X_txt_train:
    X_train_lexicon_features.append([myclfr.count_offensive(i), myclfr.count_target(i),myclfr.count_cap(i)])

for i in X_txt_test:
    X_test_lexicon_features.append([myclfr.count_offensive(i), myclfr.count_target(i),myclfr.count_cap(i)])

In [43]:
import scipy.sparse as sp
from scipy.sparse import hstack, coo_matrix
import numpy as np
np.random.seed(42)
import random
random.seed(42)

vec = CountVectorizer(ngram_range =(1,1))

X_train_w_lex = vec.fit_transform(X_txt_train) # This will be the matrix from CountVectorizer (X_txt_train)
X_test_w_lex = vec.transform(X_txt_test)

X_train_lexicon_features = np.array(X_train_lexicon_features)
X_test_lexicon_features = np.array(X_test_lexicon_features)

X_train_w_lex = hstack([X_train_lexicon_features,X_train_w_lex]).toarray()
X_test_w_lex = hstack([X_test_lexicon_features, X_test_w_lex]).toarray()


SVC = LinearSVC()
params = {'C':[0.01, 0.1, 1.]}
clf = GridSearchCV(SVC, params, cv = 5)
clf.fit(X_train_w_lex, y_train)
lexicon_pred = clf.predict(X_test_w_lex)
print('Score with Lexicon', clf.best_score_)



Score with Lexicon 0.7310293835957552




In [46]:
## creating dataframe to view test-from-train results

df_ruleCountVec = pd.DataFrame({"tweet" : X_txt_test, "actual" : y_test})
df_ruleCountVec['predictions'] = lexicon_pred.reshape(-1,1)

In [47]:
df_ruleCountVec[:10]

Unnamed: 0,tweet,actual,predictions
0,@USER @USER -its quite obvious the morons at...,TIN,TIN
1,@USER @USER @USER @USER @USER @USER @USER @USE...,NOT,NOT
2,@USER i’m weak 😂😂.. Shit must have been really...,TIN,TIN
3,@USER Who the hell does @USER think he is to s...,NOT,TIN
4,@USER Just because his best stats season was a...,TIN,NOT
5,58 factory workers massacred in Salt Lake City...,NOT,NOT
6,@USER Oh my heart! She is beautiful. MUCH more...,NOT,NOT
7,@USER URL And this explains why he is disresp...,TIN,TIN
8,@USER Trump will blame it on the immigrants fo...,NOT,TIN
9,@USER She is so proud,NOT,NOT


In [48]:
## creating dataset for false positives and negatives
df_ruleCountVec_FALSE = df_ruleCountVec[~(df_ruleCountVec['actual']==df_ruleCountVec['predictions'])]

In [49]:
df_ruleCountVec_FALSE[:10]

Unnamed: 0,tweet,actual,predictions
3,@USER Who the hell does @USER think he is to s...,NOT,TIN
4,@USER Just because his best stats season was a...,TIN,NOT
8,@USER Trump will blame it on the immigrants fo...,NOT,TIN
14,@USER No prison time for you. You're going to ...,TIN,NOT
22,@USER Look at this chill as fuck bearded drago...,UNT,TIN
30,@USER He is still eating and talking about p**...,TIN,NOT
32,@USER Dear Paul McFartney- You had us at Beat...,TIN,NOT
33,@USER Nanos Poll .........Liberals 41.1 Conser...,NOT,TIN
34,@USER Maybe because you’ve provided a play for...,TIN,NOT
36,@USER @USER @USER @USER @USER @USER @USER @USE...,TIN,NOT


### Applying to Test Data

In [50]:
# changing test_data into pd Dataframe for easy changes
td = pd.DataFrame(test_data, columns=['id', 'tweet','pred'])

In [51]:
test_data_lexicon_features = [] # Initailze to an empty list. This will be a list of lists

for i in td['tweet']: # this orignally was reading 'test_data[1]' which is the row, now this reads whole 'tweet' column
    test_data_lexicon_features.append([myclfr.count_offensive(i), myclfr.count_target(i),myclfr.count_cap(i)])
    

In [52]:
test_data_lexicon_features[:10] # checking to see the lexicon counts for each tweet

[[0, 8, 2],
 [1, 7, 1],
 [1, 2, 7],
 [0, 2, 3],
 [0, 2, 1],
 [1, 7, 2],
 [0, 14, 4],
 [1, 4, 2],
 [0, 1, 1],
 [0, 5, 5]]

In [53]:
import scipy.sparse as sp
from scipy.sparse import hstack, coo_matrix
import numpy as np
np.random.seed(42)
import random
random.seed(42)


test_data_w_lex = vec.transform(td['tweet']) #changed to read tweet column instead of 'test_data[1] row'

test_data_lexicon_features = np.array(test_data_lexicon_features)


test_data_w_lex = hstack([test_data_lexicon_features,test_data_w_lex]).toarray()

SVC = LinearSVC()
params = {'C':[0.01, 0.1, 1.]}
clf = GridSearchCV(SVC, params, cv = 5)
clf.fit(X_train_w_lex, y_train)
test_data_pred = clf.predict(test_data_w_lex)
print('Score with Lexicon', clf.best_score_) # Not sure if we need this since it's referring to the best
                                             # score/Lexicon obtianed from training data  



Score with Lexicon 0.7310293835957552




In [54]:
test_data_pred # checking the array of predictions

array(['NOT', 'TIN', 'NOT', ..., 'NOT', 'TIN', 'NOT'], dtype='<U3')

In [55]:
td['prediction']=test_data_pred.reshape(-1,1) #adding new 'prediction' array and reshaping into column 

In [56]:
td[:25] #checking test data 

Unnamed: 0,id,tweet,pred,prediction
0,41567,@USER Nancy Lee Grahn You Are Awesome! I have ...,NOT,NOT
1,19123,@USER She is a Skrull. Enemy of The Kree. The ...,NOT,TIN
2,79672,@USER @USER @USER @USER @USER @USER @USER Exce...,NOT,NOT
3,29055,@USER @USER @USER You are so beautiful♡,NOT,NOT
4,32479,@USER This is what happens when liberals get i...,NOT,NOT
5,42594,@USER @USER Daniels said her job does not refl...,NOT,TIN
6,95697,"@USER No longer on guard, Marie smiles warmly....",NOT,NOT
7,96084,@USER Gun control is omportant. It should not...,NOT,NOT
8,77548,@USER Antifa girl of the month centrefold!,NOT,NOT
9,62926,@USER @USER @USER @USER Tweet is directed at h...,NOT,NOT


In [78]:
print(td[10:15])

       id                                              tweet pred prediction
10  53633  @USER Oh well yes goes without saying you are ...  NOT        NOT
11  14182                                 @USER Good!!! MAGA  NOT        NOT
12  49199  @USER #AmazonPets  This is bonnie she is 2 yea...  NOT        NOT
13  35683  /63 More evidence Liberals only goal is to sab...  NOT        TIN
14  43602  @USER @USER She’s a class act isn’t she. Kim t...  NOT        NOT


In [None]:
len(test_data_w_lex)
len(test_data)

print(test_data_w_lex)
print(test_data)

In [None]:
len(X_train_w_lex)
print(y_train)

In [114]:
from sklearn.metrics import confusion_matrix
import numpy as np

Prediction = []
Actual = []
for row in test_data:
    Actual = row[2]
    #Prediction.append(Actual)
    #y_train.append(y)
for row in td:
    Prediction = row[3]
    
    
#confusion_matrix(Actual, y_train)

IndexError: string index out of range

In [121]:
type(test_data_pred)

numpy.ndarray

In [123]:
new = np.array(Actual)

In [126]:
print(new)

NOT


In [107]:
len(Actual)
len(y_train)

8473

In [109]:
type(X_test_w_lex)
print(X_test_w_lex)
type(y_train)
print(len(y_train))
type(X_train_w_lex)
print(len(X_train_w_lex))
type(test_data_w_lex)
print(len(test_data_w_lex))

[[3 6 2 ... 0 0 0]
 [0 5 9 ... 0 0 0]
 [1 0 1 ... 0 0 0]
 ...
 [2 1 1 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [1 2 3 ... 0 0 0]]
8473
8473
2648


In [37]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true, y_pred)

array([[2, 0, 0],
       [0, 0, 1],
       [1, 0, 2]], dtype=int64)

In [110]:
print(X_train_lexicon_features)

[[ 0 10  3]
 [ 0  0  1]
 [ 1  5  1]
 ...
 [ 0  2  2]
 [ 0  2  2]
 [ 0  0  2]]
