# *TWITTER FILTER*

# Importing the DataSet

In [22]:
import pandas as pd

In [23]:
# Load the raw tweet export; the path is relative to the notebook's working directory.
tweets_path = "datasets_391623_755187_tweets.csv"
df = pd.read_csv(tweets_path)

In [24]:
# Preview the first rows to confirm the file parsed into the expected columns.
df.head()

Unnamed: 0,id,author,status
0,1,Donald J. Trump,I will be making a major statement from the @W...
1,2,Donald J. Trump,Just arrived at #ASEAN50 in the Philippines fo...
2,3,Donald J. Trump,"After my tour of Asia, all Countries dealing w..."
3,4,Donald J. Trump,Great to see @RandPaul looking well and back o...
4,5,Donald J. Trump,Excited to be heading home to see the House pa...


# Data Cleaning

In [26]:
# Summarise dtypes and non-null counts per column to check for missing values.
df.info() #no null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      400 non-null    int64 
 1   author  400 non-null    object
 2   status  400 non-null    object
dtypes: int64(1), object(2)
memory usage: 9.5+ KB


In [27]:
# "id" is not used as a feature or target below, so drop it before modelling.
df = df.drop(columns=["id"])

In [28]:
# Confirm that only the "author" and "status" columns remain after the drop.
df.head()

Unnamed: 0,author,status
0,Donald J. Trump,I will be making a major statement from the @W...
1,Donald J. Trump,Just arrived at #ASEAN50 in the Philippines fo...
2,Donald J. Trump,"After my tour of Asia, all Countries dealing w..."
3,Donald J. Trump,Great to see @RandPaul looking well and back o...
4,Donald J. Trump,Excited to be heading home to see the House pa...


# Extracting Target and Features

In [29]:
# Target: the author label for each tweet (a 1-D Series).
y = df["author"]

In [30]:
# Feature: the tweet text. The double brackets keep x a 2-D DataFrame rather than a Series.
x = df[["status"]]

# Checking 5 Rules
  a)Features and Targets should not have Null values
  
  b)Features should be of type Array/DataFrame
  
  c)Features should be in form of Rows and Columns
  
  d)Features should be continuous/Numeric
  
  e)Features should be in same scale

In [31]:
# Rule 1: df now holds only the target ("author") and the feature ("status");
# their non-null counts must equal the number of rows.
df.info() #Rule 1

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   author  400 non-null    object
 1   status  400 non-null    object
dtypes: object(2)
memory usage: 6.4+ KB


In [32]:
# Rule 2: the feature container must be an array or a DataFrame.
type(x) #Rule 2

pandas.core.frame.DataFrame

In [33]:
# Rule 3: features must be two-dimensional, i.e. (rows, columns).
x.shape #Rule 3

(400, 1)

# Train Test Split

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
# Hold out 30% of the tweets for evaluation. Stratifying on the author keeps the
# class proportions the same in both partitions; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=21,
    stratify=y,
)

# *Rule 4 is checked after splitting. Rule 5 is omitted because there is only one feature.*

In [37]:
x.info() #Rule 4: the only feature column has dtype object (text), so it must be converted to numeric.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   status  400 non-null    object
dtypes: object(1)
memory usage: 3.2+ KB


In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer #Converting text to Numeric type

In [55]:
# TF-IDF encoder: ignore common English stop words and cap the vocabulary at 100 terms.
t = TfidfVectorizer(
    stop_words="english",
    max_features=100,
)

In [66]:
# Learn the vocabulary and IDF weights from the training tweets only, then encode them.
train_text = x_train["status"]
x_train_new = t.fit_transform(train_text)

In [70]:
# Encode the held-out tweets with the already-fitted vectorizer (transform only, no refit).
test_text = x_test["status"]
x_test_new = t.transform(test_text)

In [71]:
# Densify the sparse TF-IDF matrix into a labelled DataFrame with one "Tfidf<term>" column per term.
# get_feature_names() was removed in scikit-learn 1.2, so prefer get_feature_names_out()
# when the installed version provides it and fall back to the old name otherwise.
_get_terms = getattr(t, "get_feature_names_out", None) or t.get_feature_names
x_train_df = pd.DataFrame(x_train_new.toarray(), columns=_get_terms()).add_prefix("Tfidf")

In [72]:
# Build the test feature frame with exactly the column labels of the training frame.
# Both matrices come from the same fitted vectorizer, so the column order is identical.
# Reusing x_train_df.columns guarantees matching names and avoids the
# get_feature_names() call that scikit-learn 1.2 removed.
x_test_df = pd.DataFrame(x_test_new.toarray(), columns=x_train_df.columns)

In [69]:
# Inspect the training feature matrix: one row per training tweet, one "Tfidf<term>" column per term.
x_train_df

Unnamed: 0,Tfidf000,Tfidf11,Tfidfai,Tfidfamerica,Tfidfamp,Tfidfapec2017,Tfidfau,Tfidfaujourd,Tfidfaux,Tfidfavec,...,Tfidfveterans,Tfidfvietnam,Tfidfvotre,Tfidfvétérans,Tfidfwelcome,Tfidfwonderful,Tfidfwork,Tfidfworking,Tfidfworld,Tfidfyears
0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.396484,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.35088,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.317597,...,0.0,0.00000,0.0,0.355015,0.0,0.0,0.0,0.0,0.0,0.0
276,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
277,0.0,0.0,0.000000,0.0,0.877087,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
278,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
# Inspect the test feature matrix; its columns should be the same as in the training frame.
x_test_df

Unnamed: 0,Tfidf000,Tfidf11,Tfidfai,Tfidfamerica,Tfidfamp,Tfidfapec2017,Tfidfau,Tfidfaujourd,Tfidfaux,Tfidfavec,...,Tfidfveterans,Tfidfvietnam,Tfidfvotre,Tfidfvétérans,Tfidfwelcome,Tfidfwonderful,Tfidfwork,Tfidfworking,Tfidfworld,Tfidfyears
0,0.0,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.770765,0.0
1,0.0,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
2,0.0,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
3,0.0,0.0,0.0,0.0,0.531312,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
4,0.0,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,0.0,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
116,0.0,0.0,0.0,0.0,0.322667,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
117,0.0,0.0,0.0,0.0,0.000000,0.43251,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.404494,0.0,0.000000,0.0
118,0.0,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0


# Naive Bayes Model

In [76]:
from sklearn.naive_bayes import MultinomialNB

In [77]:
# Multinomial Naive Bayes with library defaults; it accepts the non-negative TF-IDF weights.
model = MultinomialNB()

In [78]:
# Train the classifier on the TF-IDF features of the training tweets.
model.fit(x_train_df, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [79]:
# Evaluate on the held-out tweets; score() returns the mean accuracy.
model.score(x_test_df, y_test)

0.8916666666666667

# Implementing Model on New Data

In [87]:
# Classify an unseen tweet with the fitted vectorizer and model.
tweet1 = "SAVE YOUR SECOND AMENDMENT, VOTE TRUMP!"
# Encode this tweet with the vocabulary learned from the training set.
data1 = t.transform([tweet1])
# Label the columns exactly like the training frame so the model sees the same feature names.
data1_df = pd.DataFrame(data1.toarray(), columns=x_train_df.columns)
model.predict(data1_df)

array(['Donald J. Trump'], dtype='<U15')

In [92]:
# Classify a second unseen tweet with the fitted vectorizer and model.
tweet2 = "ATTENTION CANADIANS: a new mobile app that will help limit the spread of COVID-19 is now available! The COVID Alert App will help us keep our families & communities safe & healthy. Get all the details here: "
# Encode this tweet with the vocabulary learned from the training set.
data2 = t.transform([tweet2])
# Label the columns exactly like the training frame so the model sees the same feature names.
data2_df = pd.DataFrame(data2.toarray(), columns=x_train_df.columns)
# Predict on the labelled frame, the same input type the model was fitted on.
model.predict(data2_df)

array(['Justin Trudeau'], dtype='<U15')