# Training and Classifying the Tweets

Importing the Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#Cleaning the text
import re
import nltk 
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

#Creating Bag of words
from sklearn.feature_extraction.text import CountVectorizer

#Encode categorical variable
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

#To split the data into training and testing sets
from sklearn.model_selection import train_test_split

#Training
from sklearn.naive_bayes import GaussianNB

#Using Random Forest
from sklearn.ensemble import RandomForestClassifier

#Metrics
from sklearn.metrics import confusion_matrix


Importing Dataset

In [None]:
#Load the tab-separated tweet data into a DataFrame
dataset = pd.read_csv("all_news_tweets.tsv", sep="\t")

Cleaning the data

In [None]:
#Cleaning the Data
# Build the English stop-word set and the stemmer once: neither depends on the
# tweet being processed, so recreating them per word / per tweet is wasted work.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
# Iterate over every row. Missing texts become empty strings so that corpus
# keeps exactly one entry per dataset row (Series.count() ignores NaN, which
# would make the corpus shorter than the label array).
for raw_text in dataset['Text'].fillna(''):

    #Remove non alphabetic data
    tweet_text = re.sub('[^a-zA-Z]', ' ', raw_text)

    #Convert all to lower case
    tweet_text = tweet_text.lower()

    #Stemming the data: drop English stop words and stem what remains
    tweet_text = [ps.stem(word) for word in tweet_text.split() if word not in stop_words]
    tweet_text = ' '.join(tweet_text)
    corpus.append(tweet_text)

Creating Bag of Words

In [None]:
#Creating Bag of words
# Keep at most 1500 vocabulary terms, then densify the sparse count matrix.
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus)
X = X.toarray()
# Every column after the first one is used as the target.
y = dataset.iloc[:, 1:].values

Encoding Categorical data

In [None]:
#Encode categorical data
# One-hot encode the first target column; any remaining columns pass through.
ct = ColumnTransformer([('encoder',OneHotEncoder(),[0])], remainder="passthrough")
# Re-derive the raw targets from the dataset instead of reading the previous
# value of y, so re-running this cell does not encode already-encoded data.
# np.float was only an alias of the builtin float and has been removed from
# NumPy, so the builtin is used as the dtype.
y = np.array(ct.fit_transform(dataset.iloc[:,1:].values), dtype=float)

#Avoid dummy variable trap
# Drop the first one-hot column; it is implied by the remaining ones.
y = y[:,1:]

Splitting the dataset into training and testing sets

In [None]:
#Splitting Dataset
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

Training and Classifying Data using Naive Bayes

In [None]:
#Training
# Fit a Gaussian Naive Bayes model on the training split (fit returns the estimator).
classifier = GaussianNB().fit(X_train, y_train)

# Predict the held-out rows and tabulate true vs. predicted labels.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

Looking at Metrics

In [None]:
# Metrics for the Naive Bayes model, read off the 2x2 confusion matrix cm.
# The formulas below treat cm[1][1] as true positives, cm[1][0] as false
# negatives and cm[0][1] as false positives.

#Accuracy
acc = (cm[0][0] + cm[1][1])/ np.sum(cm)
print("Accuracy is {0}".format(acc*100))

#Recall
# Report 0.0 instead of NaN when the test split has no positive samples.
actual_pos = cm[1][1] + cm[1][0]
recall = cm[1][1]/actual_pos if actual_pos else 0.0
print("Recall is {0}".format(recall*100))

#Precision
# Report 0.0 instead of NaN when the model predicted no positives.
predicted_pos = cm[1][1] + cm[0][1]
prec = cm[1][1]/predicted_pos if predicted_pos else 0.0
print("Precision is {0}".format(prec*100))

#F1 Score
# Printed as a fraction in [0, 1], unlike the percentages above.
f1_score = 2 * prec * recall/(prec+recall) if (prec+recall) else 0.0
print("F1 Score {0}".format(f1_score))

Training and Classifying using Random Forest

In [None]:
#Using Random Forest
# A random forest is stochastic; pin random_state (same seed as the train/test
# split) so that the reported metrics are reproducible across runs.
rf_classifier = RandomForestClassifier(n_estimators=10,criterion="entropy",random_state=0)
rf_classifier.fit(X_train,y_train)

# Predict the held-out rows and tabulate true vs. predicted labels.
y_pred2 = rf_classifier.predict(X_test)

cm_rf = confusion_matrix(y_test,y_pred2)

Looking at Metrics

In [None]:
# Metrics for the Random Forest model, read off the 2x2 confusion matrix cm_rf.
# The formulas below treat cm_rf[1][1] as true positives, cm_rf[1][0] as false
# negatives and cm_rf[0][1] as false positives.

#Accuracy
acc_rf = (cm_rf[0][0] + cm_rf[1][1])/ np.sum(cm_rf)
print("Accuracy is {0}".format(acc_rf*100))

#Recall
# Report 0.0 instead of NaN when the test split has no positive samples.
actual_pos_rf = cm_rf[1][1] + cm_rf[1][0]
recall_rf = cm_rf[1][1]/actual_pos_rf if actual_pos_rf else 0.0
print("Recall is {0}".format(recall_rf*100))

#Precision
# Report 0.0 instead of NaN when the model predicted no positives.
predicted_pos_rf = cm_rf[1][1] + cm_rf[0][1]
prec_rf = cm_rf[1][1]/predicted_pos_rf if predicted_pos_rf else 0.0
print("Precision is {0}".format(prec_rf*100))

#F1 Score
# Printed as a fraction in [0, 1], unlike the percentages above.
f1_score_rf = 2 * prec_rf * recall_rf/(prec_rf+recall_rf) if (prec_rf+recall_rf) else 0.0
print("F1 Score {0}".format(f1_score_rf))