In [2]:
import pandas as pd
import numpy as np

In [3]:
#reading datasets
valid_data = pd.read_csv("Hope_ENG_dev.csv")
train_data = pd.read_csv("Hope_ENG_train.csv")

In [4]:
#adding column labels
valid_data.columns =['text', 'label']
train_data.columns =['text', 'label']

In [5]:
valid_data.head()
train_data.head()

Unnamed: 0,text,label
0,@Champions Again He got killed for using false...,Non_hope_speech
1,It's not that all lives don't matter,Non_hope_speech
2,Is it really that difficult to understand? Bla...,Non_hope_speech
3,Whenever we say black isn't that racists? Why...,Non_hope_speech
4,Ros The Boss u don’t know that she’s actually ...,Non_hope_speech


In [6]:
training_data = train_data.sample(frac=0.8, random_state=25)
testing_data = train_data.drop(training_data.index)

print(f"No. of training examples: {training_data.shape[0]}")
print(f"No. of testing examples: {testing_data.shape[0]}")

No. of training examples: 33680
No. of testing examples: 8420


In [7]:
#printing our datasets
print(valid_data)
print(training_data)
print(testing_data)

                                                   text            label
0     @Generation X Counting money that she been giv...  Non_hope_speech
1     @Paola Hernandez i never said to be intolerant...  Non_hope_speech
2     @Firstlast300 Wow An opinion is that I don't l...  Non_hope_speech
3     WOW!!!!!!!That was so so inspiring and incredi...      Hope_speech
4     @FALC0n  Yea sorry I know Asian is an ethnicit...  Non_hope_speech
...                                                 ...              ...
5011  i’m actually about to start my college on civi...      Hope_speech
5012  @Sasha Dumse that is true. But we should ALL l...      Hope_speech
5013                        Women need to keep fighting      Hope_speech
5014  “God gave me a choice and my choice is love” t...      Hope_speech
5015  why is there no footage of the riots and the v...      Hope_speech

[5016 rows x 2 columns]
                                                    text            label
8480   I think you're awe

In [8]:
#encoding our data to 0 and 1
training_data['enc_label'] = training_data['label'].replace({'Non_hope_speech':0, 'Hope_speech':1})
valid_data['enc_label'] = valid_data['label'].replace({'Non_hope_speech':0, 'Hope_speech':1})

In [9]:
#removing usernames
import re

def remove_usernames_links(tweet):
    tweet = re.sub('@[^\s]+','',tweet)
    tweet = re.sub('http[^\s]+','',tweet)
    return tweet
training_data['text'] = training_data['text'].apply(remove_usernames_links)

#cleaning text

import nltk
import ssl

'''try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download()'''

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

def preprocess(sentence):
    sentence=str(sentence)
    sentence = sentence.lower()
    sentence=sentence.replace('{html}',"") 
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url=re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)  
    filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
    stem_words=[stemmer.stem(w) for w in filtered_words]
    lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
    return " ".join(filtered_words)

   

training_data['cleanText']= training_data['text'].map(lambda s:preprocess(s)) 

In [10]:
training_data

Unnamed: 0,text,label,enc_label,cleanText
8480,I think you're awesome! You tell it like it i...,Non_hope_speech,0,think awesome tell like matter color need join...
1186,Juan you are racist not these guys.,Non_hope_speech,0,juan racist guys
7818,I don't want Trump or Clinton as president eit...,Non_hope_speech,0,want trump clinton president either nni hate w...
24959,I am a female engineer and this made me cry. T...,Hope_speech,1,female engineer made cry woman awesome inspiri...
16745,Maya* Nope... Just an American that's tired o...,Non_hope_speech,0,maya nope american tired ridicule mythical whi...
...,...,...,...,...
37930,Madonna has been an advocate from day dot. She...,Hope_speech,1,madonna advocate day dot fought hard lgbt peop...
25966,Peace be upon you to our beloved brothers and ...,Hope_speech,1,peace upon beloved brothers dearest sisters
27917,All lives matter to God.,Hope_speech,1,lives matter god
36087,I loved the kids' reactions and comments. I ...,Hope_speech,1,loved kids reactions comments hope next genera...


In [11]:
df = training_data
df = df.drop(['text', 'label'], axis = 1)
df

Unnamed: 0,enc_label,cleanText
8480,0,think awesome tell like matter color need join...
1186,0,juan racist guys
7818,0,want trump clinton president either nni hate w...
24959,1,female engineer made cry woman awesome inspiri...
16745,0,maya nope american tired ridicule mythical whi...
...,...,...
37930,1,madonna advocate day dot fought hard lgbt peop...
25966,1,peace upon beloved brothers dearest sisters
27917,1,lives matter god
36087,1,loved kids reactions comments hope next genera...


In [12]:
X = df.cleanText
y = df.enc_label
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
Xtrain_vects = vectorizer.fit_transform(X_train)
Xtest_vects = vectorizer.transform(X_test)


In [14]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=48).fit(Xtrain_vects, y_train)

In [15]:
score = clf.score(Xtrain_vects, y_train)
score2 = clf.score(Xtest_vects, y_test)
print("Accuracy on Training Data :",score)
print("Accuracy on Testing Data :",score2)
from sklearn.metrics import f1_score, recall_score, precision_score
normalscore=f1_score(y_test, clf.predict(Xtest_vects), average = None)
weightedscore =f1_score(y_test, clf.predict(Xtest_vects), average = 'weighted')
macroscore= f1_score(y_test, clf.predict(Xtest_vects), average = 'macro')
microrecall = recall_score(y_test, clf.predict(Xtest_vects), average='micro')
macrorecall = recall_score(y_test, clf.predict(Xtest_vects), average='macro')
normalrecall = recall_score(y_test, clf.predict(Xtest_vects), average=None)
weightedrecall = recall_score(y_test, clf.predict(Xtest_vects), average='weighted')
macroprecisionscore= precision_score(y_test, clf.predict(Xtest_vects), average='macro')
weightedprecisionscore= precision_score(y_test, clf.predict(Xtest_vects), average='weighted')
microprecisionscore= precision_score(y_test, clf.predict(Xtest_vects), average='micro')
normalprecisionscore= precision_score(y_test, clf.predict(Xtest_vects), average=None)
print('The normal f1score :',normalscore,'\nThe weighted f1score :',weightedscore,'\nThe macro f1score :',macroscore)
print('The microrecall score :',microrecall,'\nThe macrorecall score :',macroscore,'\nThe weightedrecall score :',weightedrecall,'\nThe normalrecall score :',normalrecall)
print('The micro precision score :',microprecisionscore,'\nThe macro precision score :',macroprecisionscore,'\nThe weighted precision score :',weightedprecisionscore,'\nThe normal precision score :',normalprecisionscore)


Accuracy on Training Data : 0.9955463182897862
Accuracy on Testing Data : 0.9744655581947743
The normal f1score : [0.97340754 0.9754426 ] 
The weighted f1score : 0.9744401803757773 
The macro f1score : 0.9744250745311364
The microrecall score : 0.9744655581947743 
The macrorecall score : 0.9744250745311364 
The weightedrecall score : 0.9744655581947743 
The normalrecall score : [0.94876432 0.99941486]
The micro precision score : 0.9744655581947743 
The macro precision score : 0.9759792491080834 
The weighted precision score : 0.9756320722931042 
The normal precision score : [0.99936508 0.95259342]


In [16]:
import pickle
pickle_out = open('model.pkl', 'wb')
pickle.dump(clf, pickle_out)
pickle_out.close()

In [25]:
import joblib
joblib.dump(clf,r"../NLP-Research-SRM/model.pkl")

['../NLP-Research-SRM/model.pkl']