In [None]:
from collections import Counter
import pickle
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from joblib import dump, load

from src import chi_square_filtering
from src import data_loading
from src import preprocessing

In [None]:
import warnings
# Suppress every warning category for the remainder of the kernel session.
# NOTE(review): this also hides pandas / scikit-learn warnings raised by the
# cells below (e.g. chained-assignment or deprecation notices) — consider
# narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

#### Model for Misinformation Spreader classification

In [None]:
# Location of the raw `en` dataset, relative to the notebook's working directory.
p = Path("./data/raw/en")

# Load the dataset with the project's loader and preview the first rows.
# NOTE(review): later cells read a `target` column from this frame and hand the
# whole frame to Tweet_Selection — the exact schema is defined in
# src/data_loading.read_data; confirm there.
full_df_pan = data_loading.read_data(path = p)
full_df_pan.head()

In [None]:
# Project-local tweet selector; it is fitted and applied by the cells below.
# NOTE(review): module/class names suggest chi-square based filtering — confirm
# the exact behaviour in src/chi_square_filtering.
ts = chi_square_filtering.Tweet_Selection()

In [None]:
# 3-fold cross-validation of the tweet-selection + bag-of-words Naive Bayes
# model. The selector is re-fitted on the training rows of every fold and only
# then applied to the held-out rows, so the test fold never takes part in
# fitting the selector.
# NOTE(review): this relies on `ts.fit` replacing any previously fitted state
# rather than accumulating it across folds — confirm in src/chi_square_filtering.
kf = KFold(n_splits=3, shuffle=True, random_state=42)

results = []
for train_index, test_index in kf.split(full_df_pan):
    train_df = full_df_pan.iloc[train_index]
    test_df = full_df_pan.iloc[test_index]

    ts.fit(train_df)
    train_df = ts.transform(train_df, keepn_tweets=30)
    test_df = ts.transform(test_df, keepn_tweets=30)

    # Each TopN_Tweets entry is an iterable of strings; join it into a single
    # document per row. The joined text is built as new Series instead of
    # being assigned back onto frames that originate from `.iloc` slices,
    # which avoids chained-assignment writes on possibly shared data.
    X_train = train_df.TopN_Tweets.apply(" ".join)
    y_train = train_df.target
    X_test = test_df.TopN_Tweets.apply(" ".join)
    y_test = test_df.target

    pipe = Pipeline([('cv', CountVectorizer()),
                     ("clf", MultinomialNB())])
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)

    # Per-fold accuracy on the held-out rows.
    results.append(accuracy_score(y_test, pred))

# Mean accuracy over the three folds.
print(np.mean(results))

In [None]:
# Train the final PAN model on the complete dataset and persist it.
#
# The transformed data is bound to new names instead of overwriting
# `full_df_pan`: re-running this cell (or the cross-validation cell after it)
# therefore still starts from the frame produced by the loading cell.
ts.fit(full_df_pan)
pan_selected_df = ts.transform(full_df_pan, keepn_tweets=30)

# Each TopN_Tweets entry is an iterable of strings; join it into one document per row.
pan_documents = pan_selected_df.TopN_Tweets.apply(" ".join)

pipe = Pipeline([('cv', CountVectorizer()),
                 ("clf", MultinomialNB())])
pipe.fit(pan_documents, pan_selected_df.target)

# Make sure the output directory exists so a fresh checkout can run this cell.
Path("./model").mkdir(parents=True, exist_ok=True)
dump(pipe, './model/pan.joblib')
# NOTE(review): presumably stores the selector's fitted vocabulary for use at
# prediction time — confirm in src/chi_square_filtering.
ts.save_keep_words(path="./model/keep_words.pkl")

#### Model for Disaster Classifier.

In [None]:
# Load the raw disaster tweets, rename the text column to `tweets` (the name
# the following cells read) and run the project preprocessing on every tweet.
raw_disaster_path = Path("./data/raw/disaster/nlp_disaster.csv")
preprocessed_disaster_path = Path("./data/preprocessed/disaster/nlp_disaster.csv")

full_df_disaster = pd.read_csv(raw_disaster_path).rename(columns={"text": "tweets"})
full_df_disaster["tweets"] = full_df_disaster["tweets"].apply(preprocessing.preprocess)

# Persist the preprocessed data. `index=False` keeps the row index out of the
# file so reloading does not create a spurious `Unnamed: 0` column.
preprocessed_disaster_path.parent.mkdir(parents=True, exist_ok=True)
full_df_disaster.to_csv(preprocessed_disaster_path, index=False)

# Reload from disk so the model is trained on exactly what was persisted.
full_df_disaster = pd.read_csv(preprocessed_disaster_path)
# A tweet that preprocessing reduced to "" is written as an empty field and
# read back as NaN, which CountVectorizer rejects; restore the empty string.
full_df_disaster["tweets"] = full_df_disaster["tweets"].fillna("")

In [None]:
# Disaster classifier: token counts fed into a multinomial Naive Bayes model,
# trained on every preprocessed tweet (no hold-out split in this cell).
pipe = Pipeline(steps=[
    ('cv', CountVectorizer()),
    ('clf', MultinomialNB()),
])

X, y = full_df_disaster.tweets, full_df_disaster.target

# Kept for reference — quick 3-fold check of the same pipeline:
# cross_validate(pipe, X, y, cv=3)
pipe.fit(X, y)

In [None]:
# Serialize the fitted disaster pipeline with joblib.
# NOTE(review): assumes the ./model directory already exists — confirm or create it first.
dump(pipe, './model/disaster.joblib') 

In [None]:
from predict_sample import predict_tweet

In [None]:
# Hand-picked sample tweets used to smoke-test the prediction helper below.
# The texts are passed as-is (truncation ellipses, URLs and #URL# placeholders included).
tweets = ["Images showing the havoc caused by the #Cameroon military as they torched houses in #Oku.The shameless military is reported…",
      "The speeding car rammed into a group of people, who were returning after attending a temple festival of Ayyappan Ka… https://t.co/e3bBlaVDDA",
      "Army IDs Two Paratroopers Killed by Roadside Bomb in Afghanistan | #URL# #URL#",
      "Russia Blames Trump’s Iran Strike on Impeachment: “For Trump, the annihilation of an Iranian General presents a decen…'"]

In [None]:
# Run the project's prediction helper on the sample tweets; the cell displays whatever it returns.
# NOTE(review): presumably relies on the model files saved by the cells above — confirm in predict_sample.
predict_tweet(tweets)