In [104]:
import pandas as pd
import numpy as np
import folium

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

from folium import plugins
from folium.plugins import HeatMap
from IPython.display import display

Load the tweets from the CSV file.

In [105]:
# Read the tweet dataset; the file is decoded as Latin-1 instead of the default UTF-8.
df = pd.read_csv(
    "agaton_leyte.csv",
    encoding="latin-1",
)

Extract latitude and longitude from the coordinates column.

In [106]:
# Turn the stringified dict in `coordinates` into separate Longitude / Latitude columns.
# The markers being removed are literal text, so regex=False is passed on every call:
# "{" and "}" are regex metacharacters, and the default of `regex` differs between
# pandas versions.
coords_stripped = (
    df['coordinates']
    .str.replace("{'longitude':", "", regex=False)
    .str.replace(", 'latitude':", "", regex=False)
    .str.replace("}", "", regex=False)
)
df['coordinates'] = coords_stripped

# Assumes each stripped value looks like " <lon> <lat>" (leading space), so splitting on a
# single space yields ['', '<lon>', '<lat>'] -- hence columns 1 and 2. Split once, reuse.
# The resulting Longitude / Latitude values are still strings.
coord_parts = coords_stripped.str.split(' ', expand=True)
df['Longitude'] = coord_parts[1]
df['Latitude'] = coord_parts[2]

Count the tweets in each rating class.

In [107]:
# Class balance check: how many tweets carry each rating label.
rating_counts = df['rating'].value_counts()
rating_counts

0    59
1    21
2    20
Name: rating, dtype: int64

Split data into training data and testing data

In [108]:
# Features are the `content` column; targets are the `rating` labels.
X, Y = df['content'], df['rating']

# Hold out half of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the rating classes are imbalanced -- consider stratify=Y if the
# experiment is re-run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.50,
    random_state=42,
)

Instantiate vectorizer and fit on the training data

In [109]:
# instantiate tfidfvectorizer on word unigrams and bigrams
vect = TfidfVectorizer(analyzer = "word", ngram_range=(1, 2))

# learn the vocabulary from the training data only, then apply it to both splits;
# both matrices are densified so train and test share the same array type
# (previously only the training matrix was converted and the test matrix stayed sparse)
train_data = vect.fit_transform(X_train).toarray()
test_data = vect.transform(X_test).toarray()

print(train_data.shape)
print(test_data.shape)

(50, 1531)
(50, 1531)


### Comparing Models

Using a logistic regression model:

In [110]:
# Logistic Regression model
lr = LogisticRegression()

# fit once on the training matrix (the model was previously fitted twice in a row
# on the same data, which only repeated the work)
lr.fit(train_data, Y_train)

# accuracy on the data the model was fitted on
print("Accuracy score (training): {0:.3f}".format(lr.score(train_data, Y_train)))

# accuracy on the held-out test data
print("Accuracy score (testing): {0:.3f}".format(lr.score(test_data, Y_test)))

# print predictions in test data
lr_pred = lr.predict(test_data)
print(lr_pred)

Accuracy score (training): 0.820
Accuracy score (testing): 0.600
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]


Using a Gradient Boosting Classifier model:

In [111]:
# Gradient Boosting Classifier model
# random_state is fixed (same seed as the train/test split) so that re-running the
# notebook rebuilds the same ensemble; without it the fit can differ between runs
gb = GradientBoostingClassifier(random_state=42)

gb.fit(train_data, Y_train)

# accuracy on the data the model was fitted on
print("Accuracy score (training): {0:.3f}".format(gb.score(train_data, Y_train)))

# accuracy on the held-out test data
print("Accuracy score (testing): {0:.3f}".format(gb.score(test_data, Y_test)))

# print predictions in test data
gb_pred = gb.predict(test_data)
print(gb_pred)

Accuracy score (training): 1.000
Accuracy score (testing): 0.700
[0 0 0 0 1 2 0 0 0 0 2 0 1 0 0 2 1 1 2 2 2 0 1 0 0 1 0 0 0 1 2 2 2 0 2 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]


Get the accuracy and ROC AUC score of both models

In [112]:
# Test-set metrics for the gradient boosting model.
gb_accuracy = accuracy_score(Y_test, gb_pred)
print(f"Accuracy score: {gb_accuracy}")

# Multiclass ROC AUC (one-vs-one averaging) computed from the class probabilities.
gb_proba = gb.predict_proba(test_data)
gb_auc = roc_auc_score(Y_test, gb_proba, multi_class='ovo')
print(f"ROC AUC score: {gb_auc}")

Accuracy score: 0.7
ROC AUC score: 0.7953703703703704


In [113]:
# Test-set metrics for the logistic regression model.
lr_accuracy = accuracy_score(Y_test, lr_pred)
print(f"Accuracy score: {lr_accuracy}")

# Multiclass ROC AUC (one-vs-one averaging) computed from the class probabilities.
lr_proba = lr.predict_proba(test_data)
lr_auc = roc_auc_score(Y_test, lr_proba, multi_class='ovo')
print(f"ROC AUC score: {lr_auc}")

Accuracy score: 0.6
ROC AUC score: 0.8700231481481482


In [114]:
# Distribution of the combined (actual, predicted) code over the test tweets.
# The code is rebuilt here from Y_test and gb_pred because `wrong_pred` is only created
# in a later cell, so referencing it at this point raises NameError on a fresh kernel.
# actual + 4 * predicted is the same formula that later cell stores as `new_score`.
new_score = (Y_test + 4 * gb_pred).rename('new_score')
print(new_score.value_counts())

0     27
10     6
2      4
9      4
4      3
5      2
6      2
1      2
Name: new_score, dtype: int64


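Choose the best model and save its predictions in wrongpred.csv. The Gradient Boosting Classifier was chosen because of its higher test accuracy and sufficient ROC AUC score (logistic regression has a higher ROC AUC score, but it predicts class 0 for every test tweet).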

In [115]:
# save predictions in csv file
# NOTE: despite its name, wrong_pred holds every test tweet, not only the misclassified ones.

# Weight applied to the predicted label so that, with labels in 0-2,
# actual + weight * predicted gives a distinct code for each (actual, predicted) pair.
PREDICTED_WEIGHT = 4

# Take the test rows straight from df by index instead of rebuilding a frame from the
# X_test Series, which only worked because the Series happened to be named 'content'.
wrong_pred = (
    df.loc[X_test.index, ['content', 'coordinates', 'Latitude', 'Longitude']]
    .assign(actual=Y_test, predicted=gb_pred)  # gb_pred is positional, in X_test order
)
wrong_pred['new_score'] = wrong_pred['actual'] + (PREDICTED_WEIGHT * wrong_pred['predicted'])

wrong_pred.to_csv("wrongpred.csv")

Extract the locations of the tweets with a predicted rating of 2.

In [116]:
# Locations of the test tweets whose predicted rating is 2.
# Both columns are selected with one boolean mask; this replaces filling an empty
# frame column by column, which evaluated the same filter twice.
is_predicted_2 = wrong_pred['predicted'] == 2
df_locs = wrong_pred.loc[is_predicted_2, ['Latitude', 'Longitude']].copy()

df_locs.head()

Unnamed: 0,Latitude,Longitude
39,10.5470262,124.7434968
18,10.7922629,124.8438494
4,10.5470262,124.7434968
12,10.5470262,124.7434968
31,10.5470262,124.7434968


Plot the tweets on a geographical heatmap.

In [117]:
# Base map centred on a fixed point, zoomed out far enough to show the surrounding region.
fol_map = folium.Map(location=[10.847622263721211, 124.88887070186077], zoom_start=8)

# Latitude / Longitude come from string splitting, so coerce them to floats and drop
# rows without usable coordinates; HeatMap needs numeric [lat, lon] pairs without NaNs.
heat_points = (
    df_locs[['Latitude', 'Longitude']]
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
    .values
)
fol_map.add_child(HeatMap(heat_points, radius=15))
fol_map