In [15]:
import pandas as pd
import folium

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, f1_score

from folium import plugins
from folium.plugins import HeatMap
from IPython.display import display

In [16]:
# load tweets from csv file
df = pd.read_csv("agaton_leyte.csv", encoding='latin-1')
df['coordinates'] = df['coordinates'].str.replace("{'longitude':", "")
df['coordinates'] = df['coordinates'].str.replace(", 'latitude':", "")
df['coordinates'] = df['coordinates'].str.replace("}", "")
df['Longitude'] = df['coordinates'].str.split(' ', expand = True)[1]
df['Latitude'] = df['coordinates'].str.split(' ', expand = True)[2]
df.head()

  df['coordinates'] = df['coordinates'].str.replace("{'longitude':", "")
  df['coordinates'] = df['coordinates'].str.replace("}", "")


Unnamed: 0,id,rating,date,content,coordinates,place,Longitude,Latitude
0,0,2,2022-04-12 21:21:06+00:00,HEAR US!!! HELP US!! https://t.co/E6NDZKtQ4c,124.7434968 10.5470262,"{'fullName': 'City Of Baybay, Eastern Visayas'...",124.7434968,10.5470262
1,1,0,2022-04-12 20:56:00+00:00,Grabe Yung ulan kala ko Hindi na titigil\n#pra...,124.3892082 11.5046982,"{'fullName': 'Naval, Eastern Visayas', 'name':...",124.3892082,11.5046982
2,2,1,2022-04-12 20:53:09+00:00,"We know too well that Biliranons,LeyteÃ±os and...",124.3892082 11.5046982,"{'fullName': 'Naval, Eastern Visayas', 'name':...",124.3892082,11.5046982
3,3,1,2022-04-12 20:44:43+00:00,The flood waters entered our residence past 12...,124.8055272 11.32366,"{'fullName': 'Babatngon, Eastern Visayas', 'na...",124.8055272,11.32366
4,4,2,2022-04-12 20:40:02+00:00,HELP US !!!! \n\n#BaybayNeedsHelp #BaybayCityN...,124.7434968 10.5470262,"{'fullName': 'City Of Baybay, Eastern Visayas'...",124.7434968,10.5470262


In [17]:
# counts tweets in each rating
df['rating'].value_counts()

0    59
1    21
2    20
Name: rating, dtype: int64

In [18]:
# baseline model accuracy score
print(59/(59+21+20))

0.59


In [19]:
# split data into training data and testing data
X = df['content']
Y = df['rating']

#X.head()
#Y.head()

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42, test_size=0.50)

# instantiate countvectorizer
vect = CountVectorizer(analyzer = "word", ngram_range=(1, 2))

# fit on the training data, transform training and test data
train_data = vect.fit_transform(X_train)
test_data = vect.transform(X_test)
train_data = train_data.toarray()

print(train_data.shape)
print(test_data.shape)

(50, 1531)
(50, 1531)


In [20]:
# Logistic Regression model
lr = LogisticRegression()

test = lr.fit(train_data, Y_train)
print("Accuracy score (training): {0:.3f}".format(lr.score(train_data, Y_train)))

# accuracy score of model
print("Accuracy score (testing): {0:.3f}".format(lr.score(test_data, Y_test)))

Accuracy score (training): 1.000
Accuracy score (testing): 0.760


In [21]:
# Gradient Boosting Classifier model
gb = GradientBoostingClassifier()

gb.fit(train_data, Y_train)
print("Accuracy score (training): {0:.3f}".format(gb.score(train_data, Y_train)))

# accuracy score of model
print("Accuracy score (testing): {0:.3f}".format(gb.score(test_data, Y_test)))

Accuracy score (training): 1.000
Accuracy score (testing): 0.680


In [22]:
# print predictions in test data
lr_pred = lr.predict(test_data)
print(lr_pred)

[0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 1 0 2 0 1 0 0 0 0 2 0 0 0 0 0 2 0 0 2 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [23]:
# print predictions in test data
gb_pred = gb.predict(test_data)
print(gb_pred)

[0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 2 0 0 2 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [24]:
# save predictions in csv file
wrong_pred = pd.DataFrame(X_test, columns =['content'])

wrong_pred.loc[:, 'coordinates'] = df['coordinates']
wrong_pred.loc[:, 'Latitude'] = df['Latitude']
wrong_pred.loc[:, 'Longitude'] = df['Longitude']
wrong_pred.loc[:, 'actual'] = Y_test
wrong_pred.loc[:, 'predicted'] = gb_pred
wrong_pred.loc[:, 'new_score'] = wrong_pred['actual'] + (4* wrong_pred['predicted'])
wrong_pred.head()

wrong_pred.to_csv("wrongpred.csv")

In [25]:
print(wrong_pred['new_score'].value_counts())
print(f"ROC AUC score: {roc_auc_score(gb_pred, Y_test)}")

0     30
2      8
1      8
10     4
Name: new_score, dtype: int64
ROC AUC score: 0.9130434782608696


In [26]:
fol_map = folium.Map(location=[10.847622263721211, 124.88887070186077], zoom_start=8)

df_locs = pd.DataFrame(columns = ['Latitude', 'Longitude'])
df_locs['Latitude'] = wrong_pred.loc[wrong_pred['predicted'] == 2, 'Latitude']
df_locs['Longitude'] = wrong_pred.loc[wrong_pred['predicted'] == 2, 'Longitude']

df_locs.head()

markers = df_locs[['Latitude','Longitude']].values
fol_map.add_child(plugins.HeatMap(markers, radius=15))
fol_map