In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt                  # Importing required libraries
%matplotlib inline
import os
import sklearn
import warnings
warnings.filterwarnings("ignore")

#### Importing dataset

In [2]:
os.chdir("C:/Users/ABC/Desktop/Afgan Forum paper in R/afghanForum")
data=pd.read_csv("afghanForum.csv",error_bad_lines=False)
data.head()

b'Skipping line 38554: expected 12 fields, saw 16\nSkipping line 53649: expected 12 fields, saw 17\nSkipping line 54548: expected 12 fields, saw 20\n'


Unnamed: 0,MessageID,ThreadID,ThreadName,Date_Time,MemberID,MemberName,Message,P_Year,P_Month,P_Day,P_Date,ThreadFirstMessageID
0,1000015,5536,Love Animals? Like George Clooney?Are hungry?,2009-03-14 17:32:00,339,Paradox\t\t\t\t\t\t\t\t\t\t\t,I don't like celebrity news but this needed to...,2009,3,14,2009-03-14 17:32:00.000,1000015
1,1000038,5536,Love Animals? Like George Clooney?Are hungry?,2009-03-14 18:08:00,240,Filament\t\t\t\t\t\t\t\t\t\t\t,"Ew! ""Have courage for the great sorrows of lif...",2009,3,14,2009-03-14 18:08:00.000,1000015
2,1000069,5536,Love Animals? Like George Clooney?Are hungry?,2009-03-14 19:15:00,988,~*~JiLk@Y~*~\t\t\t\t\t\t\t\t\t\t\t,"what? I am a traveler seeking the truth, a hum...",2009,3,14,2009-03-14 19:15:00.000,1000015
3,1000099,5536,Love Animals? Like George Clooney?Are hungry?,2009-03-14 22:15:00,339,Paradox\t\t\t\t\t\t\t\t\t\t\t,Well it is as good a time as any to throw the ...,2009,3,14,2009-03-14 22:15:00.000,1000015
4,1000132,5536,Love Animals? Like George Clooney?Are hungry?,2009-03-14 22:38:00,240,Filament\t\t\t\t\t\t\t\t\t\t\t,"Ew X infinity. ""Have courage for the great sor...",2009,3,14,2009-03-14 22:38:00.000,1000015


In [3]:
dat=data[["MessageID","Message"]]
dat.shape

(88032, 2)

#### Sentiment Analysis

In [4]:
from textblob import TextBlob
import re
import textblob

In [5]:
def clean_text(text): 
        ''' 
        Utility function to clean text by removing links,
        special characters using simple regex statements. 
        '''
        return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text).split()) 

def get_text_sentiment(text):
    ''' 
    Utility function to classify sentiment of passed
    text using textblob's sentiment method 
    '''
    # create TextBlob object of passed text 
    analysis = TextBlob(clean_text(text)) 
    # set sentiment 
    if analysis.sentiment.polarity > 0: 
        return 'positive'
    elif analysis.sentiment.polarity == 0: 
        return 'neutral'
    else: 
        return 'negative'   

In [6]:
sentiment = []
for message in dat['Message']:
    sentiment.append(get_text_sentiment(message))
    
dat["Sentiment"] = sentiment
dat.head()

Unnamed: 0,MessageID,Message,Sentiment
0,1000015,I don't like celebrity news but this needed to...,positive
1,1000038,"Ew! ""Have courage for the great sorrows of lif...",positive
2,1000069,"what? I am a traveler seeking the truth, a hum...",positive
3,1000099,Well it is as good a time as any to throw the ...,positive
4,1000132,"Ew X infinity. ""Have courage for the great sor...",positive


In [7]:
dat['Sentiment_Score'] = dat.Sentiment.map({'positive':1, 'negative':-1,'neutral':0})
dat.head()

#### Converting text in message column to lower case

In [8]:
dat['Message'] = [entry.lower() for entry in dat['Message']]
dat.head()

Unnamed: 0,MessageID,Message,Sentiment,Sentiment_Score
0,1000015,i don't like celebrity news but this needed to...,positive,1
1,1000038,"ew! ""have courage for the great sorrows of lif...",positive,1
2,1000069,"what? i am a traveler seeking the truth, a hum...",positive,1
3,1000099,well it is as good a time as any to throw the ...,positive,1
4,1000132,"ew x infinity. ""have courage for the great sor...",positive,1


#### Removing Punctuation and word-tokenization

In [9]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
dat['Message']= [tokenizer.tokenize(entry) for entry in dat['Message']]

In [10]:
dat['Message']=[" ".join(content) for content in dat['Message'].values]

#### Removing Stop Words

In [11]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
dat['Message']=dat['Message'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
dat.head()

Unnamed: 0,MessageID,Message,Sentiment,Sentiment_Score
0,1000015,like celebrity news needed told news link lett...,positive,1
1,1000038,ew courage great sorrows life patience small o...,positive,1
2,1000069,traveler seeking truth human searching meaning...,positive,1
3,1000099,well good time throw offer paypal account reas...,positive,1
4,1000132,ew x infinity courage great sorrows life patie...,positive,1


#### Modelling Process

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [13]:
Y=dat["Sentiment_Score"]

#### SPlitting Data into 70:30

In [14]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(dat['Message'],Y,test_size=0.3)
print(Train_X.shape,Test_X.shape,Train_Y.shape,Test_Y.shape)

(61622,) (26410,) (61622,) (26410,)


#### TFIDF Vectorizer

In [15]:
Tfidf_vect = TfidfVectorizer(stop_words='english')
Train_X_Tfidf = Tfidf_vect.fit_transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

#### GBM CLassifier on train data

In [17]:
from sklearn.ensemble import GradientBoostingClassifier
gbm1 = GradientBoostingClassifier(random_state=10)
gbm1.fit(Train_X_Tfidf,Train_Y)
gbm1.score(Train_X_Tfidf,Train_Y)

0.7567427217552173

#### Evaluation Of performance Metrices

In [20]:
prediction_gbm1 = gbm1.predict(Test_X_Tfidf)

print("Confusion Matrix:")
print(confusion_matrix(Test_Y, prediction_gbm1))

Confusion Matrix:
[[ 1297   175  2843]
 [    8  1345  3054]
 [  208   260 17220]]


In [21]:
print("Classification Report")
print(classification_report(Test_Y, prediction_gbm1))

Classification Report
              precision    recall  f1-score   support

          -1       0.86      0.30      0.45      4315
           0       0.76      0.31      0.43      4407
           1       0.74      0.97      0.84     17688

   micro avg       0.75      0.75      0.75     26410
   macro avg       0.79      0.53      0.57     26410
weighted avg       0.77      0.75      0.71     26410



#### C 5.0  Classifier on train data

In [42]:
from sklearn import tree
dec_tree1=tree.DecisionTreeClassifier(max_depth=5,random_state=42,criterion="entropy")
dec_tree1.fit(Train_X_Tfidf,Train_Y)
predictions_tree1= dec_tree1.predict(Test_X_Tfidf)

print("Decision tree Accuracy Score -> ",accuracy_score(predictions_tree1, Test_Y)*100)

Decision tree Accuracy Score ->  67.03142748958729


#### Evaluation Of performance Metrices

In [43]:
print(classification_report(Test_Y, predictions_tree1))

              precision    recall  f1-score   support

          -1       0.55      0.02      0.04      4315
           0       0.00      0.00      0.00      4407
           1       0.67      1.00      0.80     17688

   micro avg       0.67      0.67      0.67     26410
   macro avg       0.41      0.34      0.28     26410
weighted avg       0.54      0.67      0.54     26410



In [44]:
confusion_matrix(Test_Y, predictions_tree1)

array([[   78,     0,  4237],
       [    0,     0,  4407],
       [   63,     0, 17625]], dtype=int64)

#### Bagging CLassifier on train data

In [23]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
Bag=BaggingClassifier(oob_score=True,n_jobs=-1,n_estimators=20,random_state=400,
                      base_estimator=DecisionTreeClassifier())
Bag.fit(Train_X_Tfidf,Train_Y)
Bag.score(Train_X_Tfidf,Train_Y)

0.9960566031612087

#### Evaluation Of performance Metrices

In [24]:
prediction_bag = Bag.predict(Test_X_Tfidf)

print("Confusion Matrix:")
print(confusion_matrix(Test_Y, prediction_bag))

Confusion Matrix:
[[ 2250   365  1700]
 [   64  3957   386]
 [  761   585 16342]]


In [26]:
accuracy_score(Test_Y, prediction_bag)

0.8538053767512306

In [25]:
print("Classification Report")
print(classification_report(Test_Y, prediction_bag))

Classification Report
              precision    recall  f1-score   support

          -1       0.73      0.52      0.61      4315
           0       0.81      0.90      0.85      4407
           1       0.89      0.92      0.90     17688

   micro avg       0.85      0.85      0.85     26410
   macro avg       0.81      0.78      0.79     26410
weighted avg       0.85      0.85      0.85     26410



#### Random Forest Classifier on train data

In [28]:
from sklearn.ensemble import RandomForestClassifier
rf= RandomForestClassifier(n_estimators=100, max_depth=2,random_state=0)
rf.fit(Train_X_Tfidf,Train_Y)
predictions_rf = rf.predict(Test_X_Tfidf)

print("Random Forest Accuracy Score -> ",accuracy_score(predictions_rf, Test_Y)*100)


Random Forest Accuracy Score ->  66.97463082165847


#### Evaluation Of performance Metrices

In [29]:
confusion_matrix(Test_Y, predictions_rf)

array([[    0,     0,  4315],
       [    0,     0,  4407],
       [    0,     0, 17688]], dtype=int64)

In [30]:
print("Classification Report")
print(classification_report(Test_Y, predictions_rf))

Classification Report
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00      4315
           0       0.00      0.00      0.00      4407
           1       0.67      1.00      0.80     17688

   micro avg       0.67      0.67      0.67     26410
   macro avg       0.22      0.33      0.27     26410
weighted avg       0.45      0.67      0.54     26410



#### Decision Tree Classifier CLassifier on train data

In [32]:
from sklearn import tree
dec_tree=tree.DecisionTreeClassifier(max_depth=3,random_state=0)
dec_tree.fit(Train_X_Tfidf,Train_Y)
predictions_tree= dec_tree.predict(Test_X_Tfidf)

print("Decision tree Accuracy Score -> ",accuracy_score(predictions_tree, Test_Y)*100)

Decision tree Accuracy Score ->  67.00870882241576


#### Evaluation Of performance Metrices

In [33]:
print(classification_report(Test_Y, predictions_tree))

              precision    recall  f1-score   support

          -1       0.62      0.01      0.01      4315
           0       0.00      0.00      0.00      4407
           1       0.67      1.00      0.80     17688

   micro avg       0.67      0.67      0.67     26410
   macro avg       0.43      0.33      0.27     26410
weighted avg       0.55      0.67      0.54     26410



In [34]:
confusion_matrix(Test_Y, predictions_tree)

array([[   23,     0,  4292],
       [    0,     0,  4407],
       [   14,     0, 17674]], dtype=int64)

#### Logistic Regression on train data

In [36]:
from sklearn.linear_model import LogisticRegression
log=LogisticRegression(random_state=0)
log.fit(Train_X_Tfidf,Train_Y)
predictions_log= log.predict(Test_X_Tfidf)

print("Logistic Regression Accuracy Score -> ",accuracy_score(predictions_log, Test_Y)*100)

Logistic Regression Accuracy Score ->  86.72472548277167


#### Evaluation Of performance Metrices

In [37]:
print(classification_report(Test_Y, predictions_log))

              precision    recall  f1-score   support

          -1       0.89      0.52      0.66      4315
           0       0.81      0.84      0.82      4407
           1       0.88      0.96      0.92     17688

   micro avg       0.87      0.87      0.87     26410
   macro avg       0.86      0.77      0.80     26410
weighted avg       0.87      0.87      0.86     26410



In [38]:
confusion_matrix(Test_Y, predictions_log)

array([[ 2234,   418,  1663],
       [   36,  3685,   686],
       [  235,   468, 16985]], dtype=int64)

#### SVM  on train data

In [45]:
SVM= svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
predictions_SVM= SVM.predict(Test_X_Tfidf)

print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  88.71639530480878


In [47]:
print(classification_report(Test_Y, predictions_SVM))

              precision    recall  f1-score   support

          -1       0.85      0.64      0.73      4315
           0       0.81      0.91      0.85      4407
           1       0.92      0.94      0.93     17688

   micro avg       0.89      0.89      0.89     26410
   macro avg       0.86      0.83      0.84     26410
weighted avg       0.89      0.89      0.88     26410



In [48]:
confusion_matrix(Test_Y, predictions_SVM)

array([[ 2760,   380,  1175],
       [   69,  4004,   334],
       [  436,   586, 16666]], dtype=int64)