In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import chart_studio.plotly as py
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
init_notebook_mode(connected=True)

### Pipeline

1. Data Aquisition
2. Text Preprocessing (text lowercasing, stemming, removing stopwords)
3. Text Vectorization (BOW, Tf-Idf, Word2Vec)
4. Modelling
   - ML Based
      1. Random Forest
      2. Naive Bayes
      3. SVM
   - DL Based
      1. RNN
         - LSTM
      2. CNN
      3. Pretrained Models
         - BERT
         - LLAMA
         - ZEPHYR
5. Evaluation
   - Accuracy Score
   - Confusion Matrix
   - Precision/Recall
   - ROC AUC Curve
6. Deployment

### Data Acquisition

In [145]:
data_2016 = pd.read_csv("./resources/Trump_2016.csv")
data_2018 = pd.read_csv("./resources/Trump_2018.csv")

### Exploratory Data Analysis

In [4]:
data_2016['source'].value_counts()

source
Twitter for iPhone     1950
Twitter for Android    1275
Twitter Web Client      333
Twitter Ads              63
Twitter for iPad         22
Instagram                 2
TweetDeck                 2
Media Studio              1
Periscope                 1
Name: count, dtype: int64

In [5]:
data_2018['source'].value_counts()

source
Twitter for iPhone      3482
Media Studio              39
Twitter for iPad          17
Twitter Media Studio      12
Twitter Web Client         6
Name: count, dtype: int64

### Tweets from different Sources in the year 2016 and 2018

In [6]:
tweets_2016 = data_2016.source.value_counts()
tweets_2018 = data_2018.source.value_counts()
layout = {
    "title" : "Donald Trump Tweets from different sources in 2016 and 2018",
    "xaxis_title" : "Sources",
    "yaxis_title" : "Count"
}
iplot(go.Figure([go.Bar(x=tweets_2016.index,y=tweets_2016.values,name="2016"),go.Bar(x=tweets_2018.index,y=tweets_2018.values,name="2018")],layout=layout))

In [7]:
for tweet in data_2016[data_2016.source=="Twitter Ads"].text.values :
    print(tweet)

Donate Today To Help Make America Great Again! You Can Help Stop Crooked Hillary Clinton! https://t.co/VlObaDG4eR https://t.co/mBDwW5orrd
Donate Today To Help Make America Great Again! You Can Help Stop Crooked Hillary Clinton! https://t.co/swzCe5PiNI https://t.co/0fmgNBuAcc
URGENT: we’ve just announced a $2 million fundraising goal tonight. Please stand with us! https://t.co/Ssrh55C6hW https://t.co/2KjT4TJ07Y
#TextTrump88022 for exclusive @realDonaldTrump updates!  We will Make America Great Again!
Limited opportunity to get your OFFICIAL Trump gear! Shop now! https://t.co/LQMDdNkUwR https://t.co/KSAv65FuiD
Limited opportunity to get your OFFICIAL Trump gear! Shop now! https://t.co/3lUaSztKYx https://t.co/ssNVgF7PTt
Limited opportunity to get your OFFICIAL Trump gear! Shop now!  https://t.co/3lUaSztKYx
We’ve just set a new goal: raise $4 million from our grassroots supporters by MIDNIGHT! https://t.co/Ssrh55C6hW https://t.co/9Hd6dRcojF
We’ve just set a new goal: raise $4 million from 

### ReTweet Analysis

If it is a retweet then it is most likely that the tweet was retweeted by someone from his staff member

In [8]:
data_2016[data_2016.is_retweet==True].source.value_counts()

source
Twitter for iPhone     141
Twitter Web Client      43
Twitter for iPad         2
Twitter for Android      1
Name: count, dtype: int64

In [147]:
from datetime import datetime

def convert_date_to_hour(date_string):
    # Specify the input date format
    if "-" in date_string:
        date_format = "%m-%d-%Y %H:%M:%S"
    elif "/" in date_string:
        date_format = "%m/%d/%Y %H:%M"
    
    # Parse the input date string and extract the time
    datetime_obj = datetime.strptime(date_string, date_format)
    hour = datetime_obj.hour
    
    return hour

### Time based Analysis

As we can see in the below graphs that Actual Tweets made by Trump are spiking at a particular time of day and he is inactive in other time.

In [10]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=data_2016[data_2016.source=="Twitter for iPhone"].created_at.dropna().apply(convert_date_to_hour),name="Tweets from iPhone",opacity=0.5))
fig.add_trace(go.Histogram(x=data_2016[data_2016.source=="Twitter for Android"].created_at.dropna().apply(convert_date_to_hour),name="Tweets from Android",opacity=0.5))
fig.update_layout(
    title_text='Histogram of Tweet Creation at different Hours of a Day',
    xaxis_title_text='Hour',
    yaxis_title_text='Count',
    barmode='overlay'
)
fig.show()

### Text PreProcessing

In [11]:
# Taking only the data which is from iPhone or Android as their source for training the model as we are not sure about other sources.

data_2016_clean = data_2016[data_2016.source.isin(["Twitter for iPhone","Twitter for Android"])]


In [12]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

sw_list = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def preprocess_tweet(text):
    # Split the text into words
    text = text.lower()
    # lowering the text
    words = text.split()
    # Remove words that start with "http://" or "https:// and applys lemmatization"
    cleaned_words = [lemmatizer.lemmatize(word) for word in words if word not in sw_list and not word.startswith("http://") and not word.startswith("https://")]
    
    # Join the cleaned words back into a sentence
    cleaned_text = " ".join(cleaned_words)
    
    return cleaned_text

In [13]:
data_2016_clean.text = data_2016_clean.text.apply(preprocess_tweet)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [14]:
X = data_2016_clean.text
y = LabelEncoder().fit_transform(data_2016_clean.source == "Twitter for Android")

In [15]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=100)

### Modelling

#### ML modelling

##### 1. BoW (Bag of Words)

In [40]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [17]:
cv = CountVectorizer()
X_train_bow = cv.fit_transform(X_train).toarray()
X_test_bow = cv.transform(X_test).toarray()

# This shows us that using Bag of words we got 5069 features
print("features : ",X_train_bow.shape[1])

# Training the model

gnb = GaussianNB()
gnb.fit(X_train_bow,y_train)


features :  5069


##### Evaluation for **Bag of Words**(Text Vectorization) and **NaiveBayes** (Model)

In [18]:
y_pred = gnb.predict(X_test_bow)
accuracy = round(accuracy_score (y_test, y_pred)*100,4)
print(f"Acuracy using Bow and Naive Bayes techqiue is : {accuracy}%")


Acuracy using Bow and Naive Bayes techqiue is : 75.1938%


##### Checking if the accuracy improves by using high frequency features

In [19]:
accuracy_list = []
for max_features in range(1,5000,10):
    cv = CountVectorizer(max_features=max_features)
    X_train_bow = cv.fit_transform(X_train).toarray()
    X_test_bow = cv.transform(X_test).toarray()

    # This shows us that using Bag of words we got 5069 features
    # print("features : ",X_train_bow.shape[1])
    # Training the model

    gnb = GaussianNB()
    gnb.fit(X_train_bow,y_train)

    y_pred = gnb.predict(X_test_bow)
    accuracy = round(accuracy_score (y_test, y_pred)*100,4)
    accuracy_list.append(accuracy)
    # print(f"Acuracy using Bow and Naive Bayes techqiue is : {accuracy}%")



In [20]:
fig = go.Figure(data=go.Scatter(x = list(range(1,5000,10)),y=accuracy_list, mode='lines'))

fig.update_layout(
    title="Test Accuracy based on number of features",
    xaxis_title="Max Features",
    yaxis_title="Acuuracy( in %)"
)

fig.show()

From above graph we can see that using Bag of Words we got max accuracy of **79%** at 1550 features 

##### Using the same Bag of Words and pairing it with Random Forest Modelling

In [21]:
accuracy_list = []
for max_features in range(1,2000,50):
    cv = CountVectorizer(max_features=max_features)
    X_train_bow = cv.fit_transform(X_train).toarray()
    X_test_bow = cv.transform(X_test).toarray()

    # This shows us that using Bag of words we got 5069 features
    # print("features : ",X_train_bow.shape[1])
    # Training the model

    rf = RandomForestClassifier()
    rf.fit(X_train_bow,y_train)

    y_pred = rf.predict(X_test_bow)
    accuracy = round(accuracy_score (y_test, y_pred)*100,4)
    accuracy_list.append(accuracy)
    # print(f"Acuracy using Bow and Random Forest techqiue is : {accuracy}%")


In [22]:
fig = go.Figure(data=go.Scatter(x = list(range(1,2000,50)),y=accuracy_list, mode='lines'))

fig.update_layout(
    title="Test Accuracy based on number of features",
    xaxis_title="Max Features",
    yaxis_title="Acuuracy( in %)"
)


fig.show()

We can clearly see that using Random forest we got a maximum accuracy of 84% and with only using 1000 features. and if we want a even faster model with but we are ok with lower accuracy then we can go with 350 features and we will get an accuracy of 82%


| featrure size      | Accuracy (%)  |
| :---        |    ---: |
| 350      | 82   |
| 1000         | 84      |

##### Note
> From above we have seen that BoW performed well on Random Forest, Based on the assumption that Random Forest is a better model than Naive Bayes, we will now use Tf-Idf Techique to check if Tf-Idf type of vectorization will improve the accuracy.

#### 2. Tf-Idf (Term frequency Inverse Document Frequency)

In [23]:
accuracy_list = []
for max_features in range(1,5000,50):
    tfidf = TfidfVectorizer(max_features=max_features)
    X_train_bow = tfidf.fit_transform(X_train).toarray()
    X_test_bow = tfidf.transform(X_test).toarray()

    # This shows us that using Bag of words we got 5069 features
    # print("features : ",X_train_bow.shape[1])
    # Training the model

    rf = RandomForestClassifier()
    rf.fit(X_train_bow,y_train)

    y_pred = rf.predict(X_test_bow)
    accuracy = round(accuracy_score (y_test, y_pred)*100,4)
    accuracy_list.append(accuracy)
    # print(f"Acuracy using Tf-I
    # df and Random Forest techqiue is : {accuracy}%")

In [24]:
fig = go.Figure(data=go.Scatter(x = list(range(1,5000,50)),y=accuracy_list, mode='lines'))

fig.update_layout(
    title="Test Accuracy based on number of features",
    xaxis_title="Max Features",
    yaxis_title="Acuuracy( in %)"
)


fig.show()

We can clearly see that using Random forest we got a maximum accuracy of 82.6% and with only using 1050 features. and if we want a even faster model with but we are ok with lower accuracy then we can go with 350 features and we will get an accuracy of 81.55%


| featrure size      | Accuracy (%)  |
| :---        |    ---: |
| 350      | 81.55   |
| 1050         | 82.6     |

##### Note
> Now based on the results of using different Vectorization Techniques like Bag of Words and Tf-IDF, and considering accuracies alone we can say that Bag of words has got more accurate results.

> The Explaination which i think off is that a tweet is a very small sentence and there will be very small number of words remaining after we preprocess the tweet, so now almost all the words are important. Also tweets are based on current situations and they can be different situations in the tweets so even if we use Tfidf the the vectors which we get after transforming the sentence will almost look like the Bag Of Words. so that is why both the accuracies are almost the same. But if we have similar tweets all accross the year then only the TF-IDF will shine over BoW.

##### 3. Word2Vec

Tried Word2Vec on local but it is taking too long to load a model so i am using colab for this task. and attack the colab link here !!
colab link : https://colab.research.google.com/drive/1hagYRSPlWDFjyy7pwEFw0aUH65-XJqOx?usp=sharing

I have tried to load a model on colab, but the session is crashing, therefore I am not able to perform the analysis for Word2Vec technique for text vectorization process

In [1]:
import gensim.downloader
import gensim

In [None]:
# model = gensim.models.KeyedVectors.load_word2vec_format("./resources/GoogleNews-vectors-negative300.bin.gz",binary=True)

##### Using the time as a feature

But to make use time as a feature, we need to use cylindrical encoding. i have seen that there is a distribution with it follows to the tweets which turmp make and their staff make.


In [19]:
data_2016_creation_hour = data_2016_clean.created_at.apply(convert_date_to_hour)

In [29]:
data_2016_creation_hour_sin = data_2016_creation_hour.apply(lambda x : math.sin(2*math.pi*x/23))
data_2016_creation_hour_cos = data_2016_creation_hour.apply(lambda x : math.cos(2*math.pi*x/23))

In [67]:
X = data_2016_clean[["text","created_at"]]
X.created_at = X.created_at.apply(convert_date_to_hour)
X["sin"] = X.created_at.apply((lambda x : math.sin(2*math.pi*x/23)))
X["cos"] = X.created_at.apply((lambda x : math.cos(2*math.pi*x/23)))
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=100)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



##### Using Naive Bayes and training the model using the text and time information, and using BoW for vectorizing the text

In [109]:
cv = CountVectorizer(max_features=1000)
X_train_bow = cv.fit_transform(X_train.text).toarray()
X_test_bow = cv.transform(X_test.text).toarray()

X_train_final = np.concatenate([X_train_bow,X_train[["sin","cos"]]],axis=1)
X_test_final = np.concatenate([X_test_bow,X_test[["sin","cos"]]],axis=1)

# This shows us that using Bag of words we got 5069 features
# print("features : ",X_train_bow.shape[1])
# Training the model

rf = GaussianNB()
rf.fit(X_train_final,y_train)

y_pred = rf.predict(X_test_final)
accuracy = round(accuracy_score (y_test, y_pred)*100,4)
print("Using Random Forest and using the text and time information for training the model we achieved an accuracy of : ",accuracy)

Using Random Forest and using the text and time information for training the model we achieved an accuracy of :  77.9845


##### Using Random Forest and training the model using the text and time information, and using BoW for text vectorization

In [111]:
cv = CountVectorizer(max_features=1000)
X_train_bow = cv.fit_transform(X_train.text).toarray()
X_test_bow = cv.transform(X_test.text).toarray()

X_train_final = np.concatenate([X_train_bow,X_train[["sin","cos"]]],axis=1)
X_test_final = np.concatenate([X_test_bow,X_test[["sin","cos"]]],axis=1)

# This shows us that using Bag of words we got 5069 features
# print("features : ",X_train_bow.shape[1])
# Training the model

rf = RandomForestClassifier()
rf.fit(X_train_final,y_train)

y_pred = rf.predict(X_test_final)
accuracy = round(accuracy_score (y_test, y_pred)*100,4)
print("Using Random Forest and using the text and time information for training the model we achieved an accuracy of : ",accuracy)

Using Random Forest and using the text and time information for training the model we achieved an accuracy of :  82.4806


#### Final Conclusion on which model to go with

I am going with Random Forest which was trained on the text as well as Time information and used BoW for text classification and using a max features of 1000.

### Step 3
#### a. Use the classifier to determine
##### i. Who writes the tweets that come from the other devices during 2016. How confident can we be in these results?

In [114]:
data_2016_from_other_devices = data_2016[~data_2016.source.isin(["Twitter for iPhone","Twitter for Android"])]

In [136]:
X = data_2016_from_other_devices[['text','created_at']]
X.text = X.text.apply(preprocess_tweet)
X.created_at = X.created_at.apply(convert_date_to_hour)
X["sin"] = X.created_at.apply((lambda x : math.sin(2*math.pi*x/23)))
X["cos"] = X.created_at.apply((lambda x : math.cos(2*math.pi*x/23)))
X_bow = cv.transform(X.text).toarray()
X_final = np.concatenate([X_bow,X[["sin","cos"]]],axis=1)
y_pred = rf.predict(X_final)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [137]:
print("We have around ",y_pred.sum(), " tweets which i think are made by Trump out of ",y_pred.size)

We have around  64  tweets which i think are made by Trump out of  424


In [138]:
X["predicted"] = y_pred



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [139]:
X.show

Unnamed: 0,text,created_at,sin,cos,predicted
32,look forward lowell massachusetts today. hear ...,17,-0.997669,-0.068242,1
33,woody johnson owner nyjets @jebbush’s finance ...,20,-0.730836,0.682553,1
35,wow @unionleader circulation nh dropped 75000 ...,16,-0.942261,-0.334880,0
36,don’t know @samuelljackson best knowledge play...,17,-0.997669,-0.068242,1
40,don’t cheat golf @samuelljackson cheats—with g...,19,-0.887885,0.460065,0
...,...,...,...,...,...
3541,press would cover accurately &amp; honorably w...,16,-0.942261,-0.334880,0
3545,masa (softbank) japan agreed invest $50 billio...,19,-0.887885,0.460065,0
3546,masa said would never (trump) election!,19,-0.887885,0.460065,0
3590,.@billgates @jimbrownnfl32 trump tower office ...,18,-0.979084,0.203456,0


In [141]:
# X.to_csv("./output/2016_tweets_from_other_devices.csv")

#### Conclusion for Who writes the tweets that come from the other devices during 2016. How confident can we be in these results?

> As we can see the model which was giving an accuracy of 82-83 % is a good enough classifier for distinguishing between the tweets which are made by Trump and which are not. So when we gave the model, the tweets which are sent from other devices, most of them are classified as tweets which are not posted by Trump himself

> Out of 424 tweets which are tweeted from other device i.e not iphone and android phones the model predicted that only 64 are tweets from Trump so the percentage of tweets with which he uses other devices is very low becuase 64/424 is only 15 % So most likely he doesn't used other devices at all.

##### ii. Run the classifier on tweets from Jan-Dec ​2018​. Do you think this classifier is still valid, or has something changed [e.g., perhaps Trump is no longer posting tweets]? How reliable is it for this year?

In [148]:
X = data_2018[['text','created_at']]
X.text = X.text.apply(preprocess_tweet)
X.created_at = X.created_at.apply(convert_date_to_hour)
X["sin"] = X.created_at.apply((lambda x : math.sin(2*math.pi*x/23)))
X["cos"] = X.created_at.apply((lambda x : math.cos(2*math.pi*x/23)))
X_bow = cv.transform(X.text).toarray()
X_final = np.concatenate([X_bow,X[["sin","cos"]]],axis=1)
y_pred = rf.predict(X_final)
print("We have around ",y_pred.sum(), " tweets which i think are made by Trump out of ",y_pred.size)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

We have around  1788  tweets which i think are made by Trump out of  3556


#### Answer :

> Based on the above results i.e there are around 1788 tweets which the model predicts that are made by Trump out of 3556(accross all the devices) which is nearly 50% of the tweets. Even in 2016 if we assume that the tweets are made from Android and iPhone are only considered then there were 1275 tweets from Android(Assumption that tweets are made by Trump) and from iPhone are 1950. which means that tweets that are from Trump are almost 40%, and using our model which was giving a 83% accuracy on the test data of 2016 we can say that Trump is now tweeting even more that what he was in 2016. and we can be 80% confident on our model.

0.3953488372093023