## P2 - Predicting Persuasiveness of Comments

### Downloading and loading essential libraries

In [None]:
import nltk
# Fetch the NLTK data packages this notebook relies on (tokenizer model,
# stopword lists, WordNet data for the lemmatizer).
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
pip install keras-metrics

In [None]:
pip install gensim

In [None]:
pip install textblob

In [None]:
# Required Libraries
import gensim 
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import keras_metrics
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import LinearSVC
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.models import Sequential
from keras import layers

from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.collocations import *
from nltk.stem import WordNetLemmatizer
from nltk import cluster

from textblob import TextBlob

import matplotlib.pyplot as plt

### Loading the Training and Testing Dataset CSV files into Pandas dataframes

In [None]:
# Read the training CSV (path is relative to the working directory).
df_train_org = pd.read_csv(filepath_or_buffer="P2_Training_Dataset.csv")

In [None]:
# Read the testing CSV (path is relative to the working directory).
df_test_org = pd.read_csv(filepath_or_buffer="P2_Testing_Dataset.csv")

In [None]:
# Show the (rows, columns) shape of each loaded frame.
train_shape, test_shape = df_train_org.shape, df_test_org.shape
print(train_shape, " ", test_shape)

In [None]:
# Stack both loaded frames row-wise into the single frame `df` used by the
# rest of the notebook. The *_set names are aliases of the loaded frames,
# not copies, and each input keeps its own row labels in the result.
df_train_set, df_test_set = df_train_org, df_test_org
frames = [df_train_set, df_test_set]
df = pd.concat(frames, axis='index')

#### Comments that are null are removed from processing

In [None]:
# Keep only the rows whose 'text' is present, then renumber the rows from 0.
df = df[df['text'].notnull()].reset_index(drop=True)

### Feature - Count
* The length of the comment in terms of the words is calculated.

In [None]:
# Word count of each comment: the number of whitespace-separated pieces of the
# raw text. Computed in a single pass and stored as integers so the column is
# numeric when it is later used as a model feature.
df['count'] = [len(comment.split()) for comment in df['text']]

### Data Preprocessing
* Each comment is cleaned.
* Digits and numeric forms are removed from the dataset.
* The comments are tokenized into words.
* English stopwords, punctuation, and a few extra tokens (such as "'s" and "'ve") are removed from the dataset.
* Lemmatization (WordNetLemmatizer) is performed on the dataset to reduce each word to its root form where applicable.

In [None]:
# Lazily filled cache so the stopword set and lemmatizer are built once,
# not once per comment.
_PREPROCESSING_RESOURCES = {}


def _preprocessing_resources():
    """Return the (stopword set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESSING_RESOURCES:
        ignored = set(stopwords.words('english') + list(punctuation))
        # NOTE(review): the last entry looks like a mis-encoded character
        # sequence copied from the data - confirm the intended symbol.
        ignored.update(("'s", "'ve", "‚Äö√†√ú"))
        _PREPROCESSING_RESOURCES['stopwords'] = ignored
        _PREPROCESSING_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return (_PREPROCESSING_RESOURCES['stopwords'],
            _PREPROCESSING_RESOURCES['lemmatizer'])


def preprocessing(text):
    """Clean one comment and return its lemmatized tokens.

    Digit characters are removed, the remainder is tokenized with
    ``word_tokenize``, tokens found in the stopword/punctuation set are
    dropped, and the surviving tokens are lowercased and lemmatized.

    Args:
        text: Raw comment string.

    Returns:
        List of lowercased, lemmatized token strings (possibly empty).
    """
    stop_tokens, lemmatizer = _preprocessing_resources()
    without_digits = ''.join(ch for ch in text if not ch.isdigit())
    return [
        lemmatizer.lemmatize(token.lower())
        for token in word_tokenize(without_digits)
        # Test the lowercased form as well: the stopword list is lowercase,
        # so capitalised stopwords ("The", "It") would otherwise be kept.
        if token not in stop_tokens and token.lower() not in stop_tokens
    ]

# Tokenize every comment in a single pass; each row receives the list of
# cleaned tokens returned by `preprocessing`.
df['text_tokenized'] = [preprocessing(comment) for comment in df['text']]

### Word Embeddings
* The Word2Vec embedding is utilized to identify the embeddings of words in the entire dataset.
* The words are represented in numeric embedded forms.
* The comments are vectorized: each comment is represented by the word embeddings of its tokens.

In [None]:
# Train Word2Vec on the tokenized comments: 100-dimensional vectors, a context
# window of 5, and min_count=1 so no token is dropped for being rare.
# gensim 4 renamed the `size` keyword to `vector_size`; choose the name the
# installed version accepts so this cell runs on either major version.
_w2v_dim_kwarg = (
    'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'
)
model = gensim.models.Word2Vec(
    df['text_tokenized'], min_count=1, window=5, **{_w2v_dim_kwarg: 100}
)
# Token-to-id dictionary built from the same tokenized comments.
dictionary = gensim.corpora.Dictionary(df['text_tokenized'])

In [None]:
# Discard comments left with no tokens after preprocessing, then renumber
# the remaining rows from 0.
df = df[df['text_tokenized'].map(len) > 0].reset_index(drop=True)

In [None]:
# Represent each comment by the element-wise mean of its tokens' Word2Vec
# vectors. Vectors are looked up through `model.wv` because indexing the model
# object directly is not supported by gensim 4.
# Every row must have at least one token here (empty rows are filtered out in
# the preceding cell).
list_text = [
    np.mean(model.wv[tokens], axis=0) for tokens in df['text_tokenized']
]

df['vectorized'] = list_text

### Cosine Similarity
* Cosine similarity is the dot product of two vectors divided by the product of their magnitudes.
* The cosine similarity is calculated between each CMV/Opinion post and related Reply comments.

In [None]:
# Compare every comment with the first comment of its thread.
# Rows are scanned in order; `init` is the row label of the current thread's
# first row. A row whose thread_id differs from that of `init` starts a new
# thread: it becomes the new `init` and gets the value 0, as does row 0.
# NOTE(review): nltk's cosine_distance is documented as 1 - cos(u, v), i.e. a
# distance rather than a similarity - confirm that is the intended feature.
thread_ids = df['thread_id']
vectors = df['vectorized']

init = 0
l_cosine = [0]
for row in range(1, len(df)):
    if thread_ids[init] == thread_ids[row]:
        l_cosine.append(
            cluster.util.cosine_distance(vectors[init], vectors[row])
        )
    else:
        init = row
        l_cosine.append(0)

df['cosine'] = l_cosine

### Sentiment Score
* The feature sentiment score is derived by calculating the sentiment associated with each text field in the dataset.
* The polarity of the score is considered as the sentiment of the respective comment.

In [None]:
def sentiment_calc(text):
    """Return the TextBlob polarity of `text`, or None if it cannot be scored.

    Args:
        text: Comment text to analyse.

    Returns:
        The value of ``TextBlob(text).sentiment.polarity``, or ``None`` when
        TextBlob raises an exception for this input.
    """
    try:
        return TextBlob(text).sentiment.polarity
    except Exception:
        # Deliberately best-effort, but limited to Exception so that
        # KeyboardInterrupt / SystemExit still stop a long-running pass
        # over the whole frame.
        return None

# Score every comment's raw text; rows that could not be scored hold a
# missing value.
df['sentiment'] = df['text'].map(sentiment_calc)

### Hedge Frequency
* Hedging words are considered soft words: they make a statement seem less direct and are used to limit or qualify claims.
* The feature is the number of distinct hedge words that occur in each comment.

In [None]:
# Load the hedge-word lexicon. header=None makes pandas treat the first line
# of the file as data instead of as column names.
df_hedges = pd.read_csv("hedge_words.txt", header=None)

In [None]:
# For each comment, count the distinct tokens that also appear in the hedge
# lexicon. The lexicon is flattened into a set once, so each comment costs a
# single set intersection instead of re-sorting the whole lexicon per row.
hedge_vocab = set(df_hedges.values.ravel())
l_hedge = [
    len(hedge_vocab.intersection(tokens)) for tokens in df['text_tokenized']
]

df['hedge'] = l_hedge

### Tone Word Count
* This lexicon is derived from the "Tone Word Bank".
###### http://www.foothillfalcons.org/ourpages/auto/2013/8/30/62075757/Tone.pdf
* Tone words could be essential to capture diction, viewpoint, subject matter, attitude and personality.

In [None]:
# Load the tone-word lexicon. header=None makes pandas treat the first line
# of the file as data instead of as column names.
df_tone = pd.read_csv("Tone-words.txt", header=None)

In [None]:
# For each comment, count the distinct tokens that also appear in the tone
# lexicon. The lexicon is flattened into a set once, so each comment costs a
# single set intersection instead of re-sorting the whole lexicon per row.
tone_vocab = set(df_tone.values.ravel())
l_tone = [
    len(tone_vocab.intersection(tokens)) for tokens in df['text_tokenized']
]

df['tone'] = l_tone

In [None]:
# Preview the first five rows, including the derived feature columns.
df.head(n=5)

### Data Models
* Two models are trained and then evaluated on the held-out test split.
1. Random Forest
2. Kernel Support Vector Machines

#### Random Forest

In [None]:
# Random Forest

def random_forest_model(x_train, y_train, x_test, random_state=42):
    """Fit a random forest on the training data and predict labels for x_test.

    Args:
        x_train: Training feature matrix.
        y_train: Training labels; a one-column frame or any 1-D sequence.
        x_test: Feature matrix to predict for.
        random_state: Seed for the forest so repeated runs give the same
            predictions. Pass None for an unseeded forest.

    Returns:
        Predicted labels, one per row of x_test.
    """
    rfc = RandomForestClassifier(random_state=random_state)
    # Flatten the labels: callers in this notebook pass a one-column
    # DataFrame, and scikit-learn warns when the target is a column vector.
    rfc.fit(x_train, np.ravel(y_train))
    return rfc.predict(x_test)

#### Kernel Support Vector Machine

In [None]:
# Kernel SVM

def kernal_SVM(x_train, y_train, x_test):
    """Fit a sigmoid-kernel SVM on the training data and predict for x_test.

    Args:
        x_train: Training feature matrix.
        y_train: Training labels; a one-column frame or any 1-D sequence.
        x_test: Feature matrix to predict for.

    Returns:
        Predicted labels, one per row of x_test.
    """
    svclassifier = SVC(kernel='sigmoid')
    # Flatten the labels: callers in this notebook pass a one-column
    # DataFrame, and scikit-learn warns when the target is a column vector.
    svclassifier.fit(x_train, np.ravel(y_train))
    return svclassifier.predict(x_test)

In [None]:
def metric_evaluation(y_test, y_pred):
    """Score predicted labels against the true labels.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.

    Returns:
        Tuple ``(roc_auc, fpr, tpr, accuracy, precision, recall, f1)``, where
        ``fpr``/``tpr`` are the ROC curve points and ``roc_auc`` is the area
        under that curve.
    """
    # The thresholds returned by roc_curve are not used.
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    accuracy, precision, recall, f1 = (
        scorer(y_test, y_pred)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )
    return auc(fpr, tpr), fpr, tpr, accuracy, precision, recall, f1

In [None]:
def plot_graph(roc_value, fpr, tpr):
    """Draw a ROC curve on a new figure and display it.

    Args:
        roc_value: Area under the curve, shown in the legend with two decimals.
        fpr: False positive rates (x values).
        tpr: True positive rates (y values).
    """
    curve_label = 'P AUC = %0.2f' % roc_value
    plt.figure()
    plt.plot(fpr, tpr, label=curve_label)
    plt.title('Receiver Operating Characteristic')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right')
    plt.show()

### Phase I - Baseline Features
* count - Length of a comment in terms of words.
* cosine - Cosine Similarity between each CMV and following RE vectors.
* sentiment - Sentiment score associated with the comment.
* hedge - frequency of the hedge words in comment.

In [None]:
# Phase I feature matrix and target column.
feature_columns = ['count', 'cosine', 'sentiment', 'hedge']
X = df[feature_columns].copy()
y = df[['delta']].copy()

# Hold out 13% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.13, random_state=42
)

In [None]:
# Phase I, random forest: fit, predict on the held-out rows, report the scores
# and plot the ROC curve.
r_forest_pred = random_forest_model(X_train, y_train, X_test)
evaluation = metric_evaluation(y_test, r_forest_pred)
roc_auc, fpr, tpr, accuracy, precision, recall, f1 = evaluation

for report_line, score in (
    ('Accuracy: %f', accuracy),
    ('Precision: %f', precision),
    ('Recall: %f', recall),
    ('F1 score: %f', f1),
):
    print(report_line % score)
print('Confusion Matrix: ', confusion_matrix(y_test, r_forest_pred))

plot_graph(roc_auc, fpr, tpr)

In [None]:
# Phase I, kernel SVM: fit, predict on the held-out rows, report the scores
# and plot the ROC curve.
k_svm_pred = kernal_SVM(X_train, y_train, X_test)
evaluation = metric_evaluation(y_test, k_svm_pred)
roc_auc, fpr, tpr, accuracy, precision, recall, f1 = evaluation

for report_line, score in (
    ('Accuracy: %f', accuracy),
    ('Precision: %f', precision),
    ('Recall: %f', recall),
    ('F1 score: %f', f1),
):
    print(report_line % score)
print('Confusion Matrix: ', confusion_matrix(y_test, k_svm_pred))

plot_graph(roc_auc, fpr, tpr)

In [None]:
# Save the Phase I random-forest predictions. They go to their own file:
# 'Feature_set2_Predictions.csv' is the name the Phase II export writes to,
# so sharing it would overwrite these results.
phase_1 = pd.DataFrame(r_forest_pred)
phase_1.to_csv('Feature_set1_Predictions.csv', index=False)

### Phase II - Baseline Features + Two Additional Features
* author_score - native feature of the dataset, it pertains to the score assigned by the author to the comment.
* count - Length of the comment in terms of words.
* cosine - Dot product between the CMV comment / Original Post and RE comments.
* sentiment - Sentiment score associated with respect to the comments.
* hedge - Count of the number of hedge words.
* tone - Count of number of tone words - derived from "Tone word bank".

In [None]:
# Phase II feature matrix and target column.
feature_columns = ['author_score', 'count', 'cosine', 'sentiment', 'hedge', 'tone']
X = df[feature_columns].copy()
y = df[['delta']].copy()

# Hold out 13% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.13, random_state=42
)

In [None]:
# Phase II, random forest: fit, predict on the held-out rows, report the
# scores and plot the ROC curve.
r_forest_pred = random_forest_model(X_train, y_train, X_test)
evaluation = metric_evaluation(y_test, r_forest_pred)
roc_auc, fpr, tpr, accuracy, precision, recall, f1 = evaluation

for report_line, score in (
    ('Accuracy: %f', accuracy),
    ('Precision: %f', precision),
    ('Recall: %f', recall),
    ('F1 score: %f', f1),
):
    print(report_line % score)
print('Confusion Matrix: ', confusion_matrix(y_test, r_forest_pred))

plot_graph(roc_auc, fpr, tpr)

In [None]:
# Phase II, kernel SVM: fit, predict on the held-out rows, report the scores
# and plot the ROC curve.
k_svm_pred = kernal_SVM(X_train, y_train, X_test)
evaluation = metric_evaluation(y_test, k_svm_pred)
roc_auc, fpr, tpr, accuracy, precision, recall, f1 = evaluation

for report_line, score in (
    ('Accuracy: %f', accuracy),
    ('Precision: %f', precision),
    ('Recall: %f', recall),
    ('F1 score: %f', f1),
):
    print(report_line % score)
print('Confusion Matrix: ', confusion_matrix(y_test, k_svm_pred))

plot_graph(roc_auc, fpr, tpr)

In [None]:
# Save the Phase II random-forest predictions as a single-column CSV without
# the row index.
phase_2 = pd.DataFrame(data=r_forest_pred)
phase_2.to_csv(path_or_buf='Feature_set2_Predictions.csv', index=False)

### Phase III - Custom Features
* author_score - native feature of the dataset, it pertains to the score assigned by the author to the comment.
* count - Length of the comment in terms of words.
* sentiment - Sentiment score associated with respect to the comments.
* tone - Count of number of tone words - derived from "Tone word bank".

In [None]:
# Phase III feature matrix and target column.
feature_columns = ['author_score', 'count', 'sentiment', 'tone']
X = df[feature_columns].copy()
y = df[['delta']].copy()

# Hold out 13% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.13, random_state=42
)

In [None]:
# Phase III, random forest: fit, predict on the held-out rows, report the
# scores and plot the ROC curve.
r_forest_pred = random_forest_model(X_train, y_train, X_test)
evaluation = metric_evaluation(y_test, r_forest_pred)
roc_auc, fpr, tpr, accuracy, precision, recall, f1 = evaluation

for report_line, score in (
    ('Accuracy: %f', accuracy),
    ('Precision: %f', precision),
    ('Recall: %f', recall),
    ('F1 score: %f', f1),
):
    print(report_line % score)
print('Confusion Matrix: ', confusion_matrix(y_test, r_forest_pred))

plot_graph(roc_auc, fpr, tpr)

In [None]:
# Phase III, kernel SVM: fit, predict on the held-out rows, report the scores
# and plot the ROC curve.
k_svm_pred = kernal_SVM(X_train, y_train, X_test)
evaluation = metric_evaluation(y_test, k_svm_pred)
roc_auc, fpr, tpr, accuracy, precision, recall, f1 = evaluation

for report_line, score in (
    ('Accuracy: %f', accuracy),
    ('Precision: %f', precision),
    ('Recall: %f', recall),
    ('F1 score: %f', f1),
):
    print(report_line % score)
print('Confusion Matrix: ', confusion_matrix(y_test, k_svm_pred))

plot_graph(roc_auc, fpr, tpr)

In [None]:
# Save the Phase III random-forest predictions as a single-column CSV without
# the row index.
phase_3 = pd.DataFrame(data=r_forest_pred)
phase_3.to_csv(path_or_buf='Feature_set_Predictions.csv', index=False)