In [None]:
# Importing required Python Libraries 

import pandas as pd
import seaborn as sns
import csv
import re
import string
from sklearn.metrics import accuracy_score , classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer , CountVectorizer
import pickle
import matplotlib.pyplot as plt 
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.svm import SVC

# Reading Training Data

In [None]:
# Load the labelled training tweets; later cells read its 'tweet' and 'label' columns.
data=pd.read_csv('train data.csv') 

In [None]:
# Print the text representation of the training frame for a first look.
print('Training Dataset Structure\n',data)

# Dataset Information 

In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None, so
# nesting it inside print() emitted the summary first and then the label
# followed by a stray "None". Print the label, then let info() report itself.
print('Information about Training Dataset')
data.info()

# Checking for Missing Values in Dataset and Dropping Missing Value

In [None]:
# Report the missing-value pattern, drop incomplete rows, then report again
# to confirm nothing missing is left.
na_pattern_before = data.isna().value_counts()
print('Information about missing value in dataset',na_pattern_before)

data = data.dropna()

na_pattern_after = data.isna().value_counts()
print('Checking if all missed value are drop',na_pattern_after)


# Counting Number of Anti-state(1) and Non-Anti-state(0) Label in Dataset

In [None]:

# Labels: 0 = Non-Anti-state, 1 = Anti-state.
label_counts = data['label'].value_counts()
print(" Number of Tweet for specific labels in dataset",label_counts)

# Bar chart of the class balance, one colour per label.
sns.countplot(data=data, x="label", hue="label")

# Removing Duplicate Rows from Dataset

In [None]:
# Build the duplicate-row mask once and report both sides of it.
is_duplicate = data.duplicated()
print('Total Tweets in Dataset that are not duplicate:',(~is_duplicate).sum())
print('Total Tweets in Dataset that are duplicate:',is_duplicate.sum())


In [None]:
# Drop repeated rows (pandas keeps the first occurrence by default).
data=data.drop_duplicates()

In [None]:
# Re-check the duplicate-row mask to confirm that no duplicates remain.
still_duplicate = data.duplicated()
print('Total Tweets in Dataset that are not duplicate after dropping:',(~still_duplicate).sum())
print('Total Tweets in Dataset that are duplicate after dropping:',still_duplicate.sum())


# Removing Spaces from beginning of Tweets

In [None]:
# Series.apply returns a new Series; the original cell discarded it, so the
# tweets were never actually stripped here. Assign the result back, then show
# the column as the cell output, as before.
data['tweet']=data['tweet'].apply(lambda x:x.strip())
data['tweet']

# Removing Numbers and Punctuations if any from dataset 

In [None]:
 # Display the punctuation characters that removeDigits deletes from tweets.
 string.punctuation

In [None]:
def removeDigits(tweet):
    """Return ``tweet`` without outer whitespace, ASCII digits or ASCII punctuation.

    Outer whitespace is trimmed first; every character found in
    ``string.digits`` or ``string.punctuation`` is then deleted. Whitespace
    exposed at the edges by those deletions is not trimmed again.
    """
    unwanted = string.digits + string.punctuation
    # A deletion-only translation table removes all unwanted characters in one pass.
    return tweet.strip().translate(str.maketrans('', '', unwanted))

# Replace every training tweet with its cleaned form (digits and punctuation removed).
data['tweet'] = data['tweet'].map(removeDigits)

# Dividing Training dataset into Tweet and Label

In [None]:
# X holds the cleaned tweet text; trainY holds the matching labels
# (0 = Non-Anti-state, 1 = Anti-state).
X=data['tweet']
trainY=data['label']


# Extracting Features from dataset as CountVectors 

In [None]:
# Learn the vocabulary from the cleaned training tweets and build the
# document-term count matrix in one pass.
vectorizer=CountVectorizer()
trainX=vectorizer.fit_transform(X)
print('Features Corpus and Its occurrance')
# NOTE(review): vocabulary_ maps each term to its column index in trainX, not
# to an occurrence count, despite the label printed above -- confirm intent.
print(vectorizer.vocabulary_)

# Testing Data 

In [None]:
# Load the held-out tweets and drop incomplete rows, as done for training.
test_data=pd.read_csv('testingdata.csv')
test_data=test_data.dropna()
# The vocabulary (and every model below) was learned from tweets cleaned by
# removeDigits. Vectorizing raw test tweets tokenizes them differently
# (e.g. "don't" -> "don" instead of "dont", "abc123" instead of "abc"), which
# skews every evaluation. Apply the same cleaning here, once, so that all
# later cells reading test_data['tweet'] see text preprocessed like training.
test_data['tweet']=test_data['tweet'].apply(removeDigits)
X_test=vectorizer.transform(test_data['tweet'])
Y_test=test_data['label']

# Model Classifier SVM

In [None]:
# Linear-kernel SVM trained on the bag-of-words counts.
model=SVC(kernel='linear')
model.fit(trainX,trainY)

pred=model.predict(X_test)

print('Confusion Matrix of SVM:\n')
cm = confusion_matrix(Y_test, pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=model.classes_)

disp.plot()
plt.show()

# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but
# classification_report is not: with the arguments swapped, precision and
# recall are reported transposed and "support" counts predictions, not truth.
print('Accurracy of SVM classifier:',100 *accuracy_score(Y_test,pred),'%\n')
print('SVM classification report:\n',classification_report(Y_test,pred))

# Model Classifier Logistic Regression

In [None]:
# Logistic regression trained on the bag-of-words counts.
lg=LogisticRegression()
lg.fit(trainX,trainY)
predlg=lg.predict(X_test)

print('Confusion Matrix of Logistic Regression:\n')
cmlg = confusion_matrix(Y_test, predlg, labels=lg.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cmlg,display_labels=lg.classes_)

disp.plot()
plt.show()

# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but
# classification_report is not: with the arguments swapped, precision and
# recall are reported transposed and "support" counts predictions, not truth.
print('Accurracy of Logistic Regression classifier:',100 *accuracy_score(Y_test,predlg),'%\n')
print('Logistic Regression classification report:\n',classification_report(Y_test,predlg))

# Model Classifier RandomForest

In [None]:
# Random forest trained on the bag-of-words counts. The seed is fixed so the
# reported metrics are reproducible across Restart & Run All.
randomforest=RandomForestClassifier(random_state=42)
randomforest.fit(trainX,trainY)
predrandomF=randomforest.predict(X_test)

print('Confusion Matrix of Random Forest:\n')
cmrf = confusion_matrix(Y_test, predrandomF, labels=randomforest.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cmrf,display_labels=randomforest.classes_)

disp.plot()
plt.show()

# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but
# classification_report is not: with the arguments swapped, precision and
# recall are reported transposed and "support" counts predictions, not truth.
print('Accurracy of Random Forestclassifier:',100 *accuracy_score(Y_test,predrandomF),'%\n')
print('Random Forest classification report:\n',classification_report(Y_test,predrandomF))

# Model Classifier KNN

In [None]:
# k-nearest-neighbours classifier trained on the bag-of-words counts.
KNN=KNeighborsClassifier()
KNN.fit(trainX,trainY)
predknn=KNN.predict(X_test)

print('Confusion Matrix of KNN:\n')
cmknn = confusion_matrix(Y_test, predknn, labels=KNN.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cmknn,display_labels=KNN.classes_)

disp.plot()
plt.show()

# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but
# classification_report is not: with the arguments swapped, precision and
# recall are reported transposed and "support" counts predictions, not truth.
print('Accurracy of KNN:',100 *accuracy_score(Y_test,predknn),'%\n')
print('KNN classification report:\n',classification_report(Y_test,predknn))

# For Model Creation we selected RandomForest Classifier

In [None]:
# End-to-end text pipeline: token counts -> TF-IDF weighting -> random forest.
# The step names are runtime keys (used by named_steps / get_params), so they
# are left exactly as they were. The forest is seeded so the fitted and
# pickled model is reproducible across Restart & Run All.
pipe=Pipeline([
    ('vectorizer',CountVectorizer()),
    ('trnasformer',TfidfTransformer()),
    ('model' , RandomForestClassifier(random_state=42))
    
]) 

In [None]:
# Fit the whole pipeline on the cleaned tweet text (X), not on trainX: the
# pipeline contains its own CountVectorizer step.
pipe.fit(X,trainY)

# Evaluate Pipeline on Test data 

In [None]:
# Score the fitted pipeline on the held-out tweets.
predp=pipe.predict(test_data['tweet'])

print('Confusion Matrix of pipeline model:\n')
cmp = confusion_matrix(Y_test, predp, labels=pipe.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cmp,display_labels=pipe.classes_)

disp.plot()
plt.show()

# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but
# classification_report is not: with the arguments swapped, precision and
# recall are reported transposed and "support" counts predictions, not truth.
print('Accurracy of pipeline:',100 *accuracy_score(Y_test,predp),'%\n')
print('Pipeline classification report:\n',classification_report(Y_test,predp))

# Saving Pipeline

In [None]:
# Persist the fitted pipeline. The context manager closes (and flushes) the
# file handle even if pickling fails; the bare open() calls leaked both handles.
with open('model.pkl','wb') as model_file:
    pickle.dump(pipe,model_file)

# loading and checking if our saved model works
# NOTE: pickle.load can execute arbitrary code -- only load files that this
# notebook (or another trusted source) wrote.
with open('model.pkl','rb') as model_file:
    RandomForest=pickle.load(model_file)
preds=RandomForest.predict(test_data['tweet'])
# Arguments in scikit-learn's documented (y_true, y_pred) order.
accuracy_score(Y_test,preds)
#same accuracy as pipeline ,yes our model is working

# Predicting Label for Test data

In [None]:
# Map every test tweet to a human-readable label. Keyed by tweet text, so
# repeated tweets collapse into one entry (last prediction wins).
result={}
for i,j in zip(test_data['tweet'],preds):
    if j==0.0:
        result[i]='Not anti State'
    else:
        # Label typo fixed: this was written to the output file as 'Anit State'.
        result[i]='Anti State'

# Write through csv.writer so tweets containing commas, quotes or newlines are
# quoted instead of corrupting the row layout. newline='' is what the csv
# module requires of the file object, lineterminator='\n' keeps the original
# line endings, and an explicit encoding avoids locale-dependent failures on
# non-ASCII tweets.
with open('Test_label_result.csv', 'w', newline='', encoding='utf-8') as f:
    writer=csv.writer(f,lineterminator='\n')
    writer.writerows(result.items())
print('Saved file')

In [None]:
# Show the tweet -> label mapping built in the previous cell.
result