In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt                  # Importing required libraries
%matplotlib inline
import os
import sklearn
import warnings
warnings.filterwarnings("ignore")

#### Importing Dataset

In [3]:
# NOTE(review): machine-specific absolute path; point this at your own copy of
# the data folder before running the notebook on another machine.
os.chdir("C:/Users/ABC/Desktop/Afgan Forum paper in R/afghanForum")

# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines="warn"` is its replacement: rows with an unexpected number of
# fields are skipped and reported instead of aborting the load.
data = pd.read_csv("afghanForum.csv", on_bad_lines="warn")
data.head()

b'Skipping line 38554: expected 12 fields, saw 16\nSkipping line 53649: expected 12 fields, saw 17\nSkipping line 54548: expected 12 fields, saw 20\n'


Unnamed: 0,MessageID,ThreadID,ThreadName,Date_Time,MemberID,MemberName,Message,P_Year,P_Month,P_Day,P_Date,ThreadFirstMessageID
0,1000015,5536,Love Animals? Like George Clooney?Are hungry?,2009-03-14 17:32:00,339,Paradox\t\t\t\t\t\t\t\t\t\t\t,I don't like celebrity news but this needed to...,2009,3,14,2009-03-14 17:32:00.000,1000015
1,1000038,5536,Love Animals? Like George Clooney?Are hungry?,2009-03-14 18:08:00,240,Filament\t\t\t\t\t\t\t\t\t\t\t,"Ew! ""Have courage for the great sorrows of lif...",2009,3,14,2009-03-14 18:08:00.000,1000015
2,1000069,5536,Love Animals? Like George Clooney?Are hungry?,2009-03-14 19:15:00,988,~*~JiLk@Y~*~\t\t\t\t\t\t\t\t\t\t\t,"what? I am a traveler seeking the truth, a hum...",2009,3,14,2009-03-14 19:15:00.000,1000015
3,1000099,5536,Love Animals? Like George Clooney?Are hungry?,2009-03-14 22:15:00,339,Paradox\t\t\t\t\t\t\t\t\t\t\t,Well it is as good a time as any to throw the ...,2009,3,14,2009-03-14 22:15:00.000,1000015
4,1000132,5536,Love Animals? Like George Clooney?Are hungry?,2009-03-14 22:38:00,240,Filament\t\t\t\t\t\t\t\t\t\t\t,"Ew X infinity. ""Have courage for the great sor...",2009,3,14,2009-03-14 22:38:00.000,1000015


In [4]:
# Keep only the message id and text. `.copy()` makes `dat` an independent
# frame, so the columns added to it in later cells are written to `dat` itself
# rather than to a slice of `data` (the chained-assignment pattern pandas
# warns about).
dat = data[["MessageID", "Message"]].copy()
dat.shape

(88032, 2)

#### Sentiment analysis of message column in data

In [5]:
from textblob import TextBlob
import re
import textblob

In [6]:
def clean_text(text):
    '''
    Strip noise from a message before it is scored.

    Each of the following is replaced by a single space:
      * @mentions ("@" followed by ASCII letters/digits),
      * any character that is not an ASCII letter, digit, space or tab,
      * URLs of the form scheme://...

    The remaining whitespace-separated tokens are returned joined by single
    spaces, so leading/trailing/repeated whitespace is collapsed.
    '''
    # Raw string: the regex escapes (\w, \/, \S, \t) are handed to `re`
    # verbatim instead of relying on Python leaving unknown escapes alone,
    # which triggers invalid-escape-sequence warnings.
    pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"
    return ' '.join(re.sub(pattern, " ", text).split())

def get_text_sentiment(text):
    '''
    Label a piece of text as 'positive', 'neutral' or 'negative'.

    The text is first passed through clean_text, then scored with
    TextBlob; the sign of the polarity decides the label.
    '''
    # polarity of the cleaned text, read once and reused below
    polarity = TextBlob(clean_text(text)).sentiment.polarity

    # exactly zero means TextBlob found no sentiment either way
    if polarity == 0:
        return 'neutral'
    return 'positive' if polarity > 0 else 'negative'

In [7]:
# Label every message in one pass; the list is kept under the name `sentiment`.
sentiment = [get_text_sentiment(message) for message in dat['Message']]

# Attach the labels as a new column and preview the result.
dat["Sentiment"] = sentiment
dat.head()

Unnamed: 0,MessageID,Message,Sentiment
0,1000015,I don't like celebrity news but this needed to...,positive
1,1000038,"Ew! ""Have courage for the great sorrows of lif...",positive
2,1000069,"what? I am a traveler seeking the truth, a hum...",positive
3,1000099,Well it is as good a time as any to throw the ...,positive
4,1000132,"Ew X infinity. ""Have courage for the great sor...",positive


In [8]:
# Numeric encoding of the three sentiment labels.
score_by_label = {'positive': 1, 'negative': -1, 'neutral': 0}
dat['Sentiment_Score'] = dat['Sentiment'].map(score_by_label)

In [9]:
# Preview the first rows, now including the numeric Sentiment_Score column.
dat.head()

Unnamed: 0,MessageID,Message,Sentiment,Sentiment_Score
0,1000015,I don't like celebrity news but this needed to...,positive,1
1,1000038,"Ew! ""Have courage for the great sorrows of lif...",positive,1
2,1000069,"what? I am a traveler seeking the truth, a hum...",positive,1
3,1000099,Well it is as good a time as any to throw the ...,positive,1
4,1000132,"Ew X infinity. ""Have courage for the great sor...",positive,1


#### Converting text in Message column to lower case

In [10]:
# Lower-case every message with the vectorised string accessor instead of a
# Python-level loop; unlike `entry.lower()` it leaves missing values as NaN
# rather than raising on them.
dat['Message'] = dat['Message'].str.lower()
dat.head()

Unnamed: 0,MessageID,Message,Sentiment,Sentiment_Score
0,1000015,i don't like celebrity news but this needed to...,positive,1
1,1000038,"ew! ""have courage for the great sorrows of lif...",positive,1
2,1000069,"what? i am a traveler seeking the truth, a hum...",positive,1
3,1000099,well it is as good a time as any to throw the ...,positive,1
4,1000132,"ew x infinity. ""have courage for the great sor...",positive,1


#### Removing punctuation and tokenizing the text into words

In [11]:
from nltk.tokenize import RegexpTokenizer
# A token is a run of word characters, so punctuation never ends up in a token.
tokenizer = RegexpTokenizer(r'\w+')
# Replace each message string by its list of tokens.
dat['Message'] = dat['Message'].map(tokenizer.tokenize)

In [12]:
# Glue each token list back into one space-separated string.
dat['Message'] = dat['Message'].map(" ".join)

#### Removing stop words

In [13]:
from nltk.corpus import stopwords
# English stop-word list used by the filtering cell below.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be available
# locally — confirm before running on a fresh machine.
stop = stopwords.words('english')

In [14]:
# `stop` is a list, so `word not in stop` scanned it end to end for every token
# of every message. A frozenset built once gives constant-time lookups; `stop`
# itself is left untouched.
stop_lookup = frozenset(stop)

# Drop stop words from each (already lower-cased, space-separated) message.
dat['Message'] = dat['Message'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_lookup)
)

In [15]:
# Preview the messages after lower-casing, tokenization and stop-word removal.
dat.head()

Unnamed: 0,MessageID,Message,Sentiment,Sentiment_Score
0,1000015,like celebrity news needed told news link lett...,positive,1
1,1000038,ew courage great sorrows life patience small o...,positive,1
2,1000069,traveler seeking truth human searching meaning...,positive,1
3,1000099,well good time throw offer paypal account reas...,positive,1
4,1000132,ew x infinity courage great sorrows life patie...,positive,1


#### Model Building

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [17]:
# Target vector: the numeric sentiment label (-1, 0 or 1) of each message.
Y=dat["Sentiment_Score"]

#### Splitting data into train and test sets (70:30)

In [18]:
# Hold out 30% of the messages for testing. The seed pins the split: without
# it every re-run of this cell draws different rows, and the recorded class
# supports in the evaluation cells below no longer agree with each other.
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    dat['Message'], Y, test_size=0.3, random_state=0
)

In [19]:
# Shapes of the four pieces of the split, in one space-separated line.
print(*(part.shape for part in (Train_X, Test_X, Train_Y, Test_Y)))

(61622,) (26410,) (61622,) (26410,)


#### TF-IDF vectorization of train and test data

In [20]:
# TF-IDF features. The vectorizer is fitted on the training messages only;
# the test messages are then encoded with that same fitted vectorizer, so both
# matrices share one set of columns.
Tfidf_vect = TfidfVectorizer(stop_words='english')
Train_X_Tfidf = Tfidf_vect.fit_transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

#### Count vectorizer of train and test data

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
# Count features, built the same way as the TF-IDF ones: fit on train, reuse on test.
count_vectorizer = CountVectorizer(stop_words='english')

# Fit the vectorizer on the training messages and encode them in one step
count_train = count_vectorizer.fit_transform(Train_X) 

# Encode the test messages with the vectorizer fitted on the training data
count_test = count_vectorizer.transform(Test_X)

In [22]:
# Shapes of the TF-IDF and count matrices (train, test) on one line.
print(*(matrix.shape for matrix in (Train_X_Tfidf, Test_X_Tfidf, count_train, count_test)))

(61622, 117175) (26410, 117175) (61622, 117175) (26410, 117175)


#### Random forest classifier on count vectorizer data

In [23]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 100 depth-2 trees on the count features; random_state
# pins the fitted model.
# NOTE(review): the recorded confusion matrix for this model has every test
# row in the last column, i.e. it only ever predicts class 1, so the accuracy
# below is just the share of class-1 rows. Worth revisiting max_depth / class
# balance before drawing conclusions from it.
rf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
rf.fit(count_train, Train_Y)
predictions_rf = rf.predict(count_test)

# Metrics take (y_true, y_pred); pass the labels in that order, matching the
# confusion_matrix call that follows.
print("Random Forest Accuracy Score -> ", accuracy_score(Test_Y, predictions_rf)*100)

Random Forest Accuracy Score ->  66.90647482014388


In [24]:
# Confusion matrix of the count-based random forest: true labels are passed
# first, predictions second.
confusion_matrix(Test_Y, predictions_rf)

array([[    0,     0,  4328],
       [    0,     0,  4412],
       [    0,     0, 17670]], dtype=int64)

#### Random forest classifier on TFIDF vectorizer data

In [27]:
# Same forest configuration as the count-based model (100 trees, depth 2,
# fixed seed), trained on the TF-IDF features instead.
# NOTE(review): the recorded confusion matrix again shows only class-1
# predictions, so this accuracy is the class-1 share of the test set.
rf1 = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
rf1.fit(Train_X_Tfidf, Train_Y)
predictions_rf1 = rf1.predict(Test_X_Tfidf)

# Metrics take (y_true, y_pred); pass the labels in that order, matching the
# confusion_matrix call that follows.
print("Random Forest Accuracy Score -> ", accuracy_score(Test_Y, predictions_rf1)*100)

Random Forest Accuracy Score ->  66.90647482014388


In [28]:
# Confusion matrix of the TF-IDF random forest: true labels are passed first,
# predictions second.
confusion_matrix(Test_Y, predictions_rf1)

array([[    0,     0,  4328],
       [    0,     0,  4412],
       [    0,     0, 17670]], dtype=int64)

####  Decision Tree classifier on count vectorizer data

In [29]:
import sklearn.tree as tree
# Single depth-3 decision tree on the count features; random_state pins the
# fitted tree.
# NOTE(review): per the recorded report, this tree never predicts class 0 and
# almost never class -1, so its accuracy is close to the class-1 share.
dec_tree = tree.DecisionTreeClassifier(max_depth=3, random_state=0)
dec_tree.fit(count_train, Train_Y)
predictions_tree = dec_tree.predict(count_test)

# Metrics take (y_true, y_pred); pass the labels in that order, matching the
# classification_report and confusion_matrix calls that follow.
print("Decision tree Accuracy Score -> ", accuracy_score(Test_Y, predictions_tree)*100)

Decision tree Accuracy Score ->  66.94055282090117


In [30]:
# Per-class precision, recall, F1 and support for the count-based decision tree.
print(classification_report(Test_Y, predictions_tree))

              precision    recall  f1-score   support

          -1       0.60      0.01      0.01      4328
           0       0.00      0.00      0.00      4412
           1       0.67      1.00      0.80     17670

   micro avg       0.67      0.67      0.67     26410
   macro avg       0.42      0.34      0.27     26410
weighted avg       0.55      0.67      0.54     26410



In [31]:
# Confusion matrix of the count-based decision tree. In the recorded output
# the row totals equal the per-class supports of the report above (classes
# -1, 0, 1), so rows are true labels and columns are predictions.
confusion_matrix(Test_Y, predictions_tree)

array([[   26,     0,  4302],
       [    0,     0,  4412],
       [   17,     0, 17653]], dtype=int64)

####  Decision Tree classifier on TFIDF vectorizer data

In [50]:
# Same depth-3 decision tree configuration as the count-based model, trained
# on the TF-IDF features instead.
# NOTE(review): per the recorded report, this tree also never predicts class 0
# and almost never class -1.
dec_tree1 = tree.DecisionTreeClassifier(max_depth=3, random_state=0)
dec_tree1.fit(Train_X_Tfidf, Train_Y)
predictions_tree1 = dec_tree1.predict(Test_X_Tfidf)

# Metrics take (y_true, y_pred); pass the labels in that order, matching the
# classification_report and confusion_matrix calls that follow.
print("Decision tree Accuracy Score -> ", accuracy_score(Test_Y, predictions_tree1)*100)

Decision tree Accuracy Score ->  67.16395304808785


In [55]:
# Per-class precision, recall, F1 and support for the TF-IDF decision tree.
# NOTE(review): the recorded supports here differ from those in the
# count-based tree's report, so the two outputs come from different
# train/test splits and are not directly comparable — re-run top to bottom.
print(classification_report(Test_Y, predictions_tree1))

              precision    recall  f1-score   support

          -1       0.52      0.01      0.01      4278
           0       0.00      0.00      0.00      4397
           1       0.67      1.00      0.80     17735

   micro avg       0.67      0.67      0.67     26410
   macro avg       0.40      0.33      0.27     26410
weighted avg       0.54      0.67      0.54     26410



In [56]:
# Confusion matrix of the TF-IDF decision tree: true labels are passed first,
# predictions second.
confusion_matrix(Test_Y, predictions_tree1)

array([[   25,     0,  4253],
       [    1,     0,  4396],
       [   22,     0, 17713]], dtype=int64)