In [42]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize, PunktSentenceTokenizer

In [57]:
# Import the dataset
# There is a total of 4511 tweets 
# Take 10% to form classifier and code rest (450)
df = pd.read_excel("/Users/user/Documents/Python /Twitter Webscraping/Analysis Scripts for Thesis/Excel Files for Thesis/Test_Data_For_Classifier.xlsx", headers = True)

In [58]:
# Split tweets into training and test set
# Use 80% as training set and 20% as test set 
train_data, test_data, train_labels, test_labels = train_test_split(df['tweetText'], df['Labels'], test_size = 0.2, random_state = 1)

In [59]:
# Check data and labels are same length 
print(len(train_data))
print(len(train_labels))

360
360


In [60]:
# Create the counter to vectorize the data 
counter = CountVectorizer(ngram_range=(1,2))  # Create counter object using unigrams and bigrams
counter.fit(train_data)  # Fit counter on the training set 
train_counts = counter.transform(train_data)  # Transform training set into count vectors
test_counts = counter.transform(test_data)  # Transform testing set into count vectors
print(train_data[1])
print(train_counts[1])  # See what tweet looks like as a count vector

youre going dahhhhn you schlaaaaaaag
  (0, 12)	1
  (0, 14)	1
  (0, 71)	1
  (0, 87)	1
  (0, 184)	1
  (0, 185)	1
  (0, 409)	2
  (0, 446)	1
  (0, 451)	1
  (0, 759)	1
  (0, 807)	1
  (0, 1593)	1
  (0, 1835)	1
  (0, 1876)	1
  (0, 2041)	1
  (0, 2048)	1
  (0, 2690)	1
  (0, 2693)	1
  (0, 2707)	1
  (0, 2712)	1
  (0, 2780)	1
  (0, 2782)	1
  (0, 3037)	1
  (0, 3043)	1
  (0, 3653)	1
  :	:
  (0, 4768)	1
  (0, 4784)	1
  (0, 4821)	1
  (0, 4828)	1
  (0, 4978)	4
  (0, 4979)	1
  (0, 4982)	1
  (0, 5028)	1
  (0, 5058)	1
  (0, 5110)	1
  (0, 5113)	1
  (0, 5133)	1
  (0, 5135)	1
  (0, 5228)	1
  (0, 5241)	1
  (0, 5248)	1
  (0, 5261)	1
  (0, 5306)	1
  (0, 5313)	1
  (0, 5467)	1
  (0, 5482)	1
  (0, 5526)	1
  (0, 5544)	1
  (0, 5705)	1
  (0, 5714)	1


In [47]:
# Fit Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(multi_class='multinomial', solver='lbfgs') 
model.fit(train_counts, train_labels)
predictions = model.predict(test_counts)

In [61]:
# Naive Bayes
# Create the classifier and train it
classifier = MultinomialNB()  # Create classifier model
classifier.fit(train_counts, train_labels)  # Fit the model to the vector counts of the training data
predictions = classifier.predict(test_counts)  # Make a list of predictions from the vector counts of the test data

In [62]:
# NB gives higher accuracy at 85%
from sklearn.metrics import accuracy_score
print(accuracy_score(test_labels, predictions))

0.8555555555555555


In [51]:
# Use classifier to predcit class for remaining tweets

In [63]:
# Import dataset
df2 = pd.read_excel("/Users/user/Documents/Python /Twitter Webscraping/Analysis Scripts for Thesis/Excel Files for Thesis/Remaining Boris Replies for Classification.xlsx", headers = True)

In [64]:
# Create list of tweet text, classify and add back to dataframe
tweets_to_predict = df2['tweetText']

# Transform tweets into vector counts
new_tweet_counts = counter.transform(tweets_to_predict)

# Create a list of predicted labels for the new tweets
new_predictions = classifier.predict(new_tweet_counts)

# Add the labels back into the dataframe
df2["Labels"] = new_predictions

In [65]:
# Create datasets for supportive and unsupportive tweets
# Split into two dataframes - for and agaisnt 
favour_tweets = []
against_tweets = []

for index, row in df2.iterrows():
    if row['Labels'] == 1:
        favour_tweets.append(row)
    else:
        against_tweets.append(row)
        
favour_tweets = pd.DataFrame(favour_tweets)
against_tweets = pd.DataFrame(against_tweets)

favour_tweets.drop('Unnamed: 0', axis=1, inplace=True)
against_tweets.drop('Unnamed: 0', axis=1, inplace=True)

In [66]:
# Save dataframes
favour_tweets.to_excel('Supportive_Tweets_For_Johnson.xlsx')
against_tweets.to_excel('Unsupportive_Tweets_For_Johnson.xlsx')