In [None]:
import xml.etree.ElementTree as ET
from pathlib import Path

import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# Raw corpus directory; passed to read_data, which looks for *.xml tweet files
# and a *.txt label file.
p = Path("./data/raw/en")

In [None]:
def read_data(path):
    """Read per-user tweets and labels from a directory into a DataFrame.

    Every ``*.xml`` file directly inside ``path`` is treated as one user: the
    file stem becomes the ``user_id`` and the text of each element matched by
    ``./documents/`` becomes one entry of that user's tweet list. Labels are
    read from the first (alphabetically) ``*.txt`` file in ``path``, whose
    lines are expected to look like ``<user_id>:::<integer label>``; lines
    that do not split into exactly two ``:::``-separated fields are skipped.

    Parameters
    ----------
    path : str or pathlib.Path
        Directory containing the XML files and the label file.

    Returns
    -------
    pandas.DataFrame
        One row per XML file with columns ``user_id``, ``tweets`` (a list
        holding the text of each matched element) and ``target`` (the label
        looked up by ``user_id``; missing when the label file has no entry
        for that user).

    Raises
    ------
    FileNotFoundError
        If ``path`` contains no ``*.txt`` label file.
    ValueError
        If a label field cannot be converted to ``int``.
    """
    data_dir = Path(path)

    # Sorted so the row order -- and therefore any seeded train/test split --
    # does not depend on the order in which the filesystem lists the files.
    tweets_paths = sorted(data_dir.glob("*.xml"))
    label_paths = sorted(data_dir.glob("*.txt"))
    if not label_paths:
        raise FileNotFoundError(f"no *.txt label file found in {data_dir}")
    targets_path = label_paths[0]

    tweets = []
    user_ids = []
    # read tweets from xml files
    for file_path in tweets_paths:
        root = ET.parse(file_path).getroot()
        # ElementTree expands a trailing "/" to "/*", so this selects every
        # child element of <documents>.
        documents = root.findall("./documents/")
        tweets.append([doc.text for doc in documents])
        user_ids.append(file_path.stem)

    # get target mapping
    target_map = {}
    with open(targets_path, encoding="utf-8") as f:
        for line in f.read().split("\n"):
            try:
                user_id, target = line.split(":::")
            except ValueError:
                # blank or malformed line: not a "<user_id>:::<label>" pair
                continue
            target_map[user_id] = int(target)

    # prepare dataframe
    df = pd.DataFrame({
        "user_id": user_ids,
        "tweets": tweets
    })
    df["target"] = df.user_id.map(target_map)

    return df

In [None]:
# Build the per-user frame (columns: user_id, tweets, target) from the raw corpus.
df = read_data(path = p)
# Disabled: would collapse each user's tweet list into a single space-joined
# string; Tweet_Selection below iterates over the individual tweets instead.
# df.tweets = df.tweets.apply(lambda x: " ".join(x))
# df.head()

In [None]:
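# Disabled alternative input: load the preprocessed disaster CSV and rename its
# `text` column to `tweets`, instead of reading the XML corpus above.
# df = pd.read_csv("./data/preprocessed/disaster/nlp_disaster.csv")
# df = df.rename({"text":"tweets"}, axis = 1)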

In [None]:
# Hold out 20% of the rows as the test split; stratifying on the label keeps
# the class proportions the same in both splits, and the fixed seed makes the
# split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["target"].values,
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
class Tweet_Selection:
    """Keep, per user, the tweets containing the most chi-square-selected words.

    ``fit`` learns a list of informative words from labelled training data.
    ``transform`` scores each tweet by how many of its whitespace-separated
    tokens (lower-cased) appear in that list and keeps the highest scorers.
    """

    def fit(self, df_train, nimportant_words=4000):
        """Learn the important-word list and store it in ``self.keep_words``.

        Parameters
        ----------
        df_train : pandas.DataFrame
            Must have a ``tweets`` column (one sequence of tweet strings per
            row) and a ``target`` column with the class labels.
        nimportant_words : int, default 4000
            How many of the top chi-square-ranked terms to keep.

        Returns
        -------
        Tweet_Selection
            ``self``, so the call can be chained.
        """
        self.keep_words = self.get_n_important_words(df_train, N=nimportant_words)
        return self

    def get_n_important_words(self, df_train, N):
        """Return the ``N`` TF-IDF terms that score highest under chi-square.

        Each row's tweets are joined into one document, vectorised with
        ``TfidfVectorizer(sublinear_tf=True)`` and ranked against
        ``df_train.target`` with ``SelectKBest(chi2)``. If the vocabulary has
        fewer than ``N`` terms, all of them are returned.

        Returns
        -------
        numpy.ndarray
            The selected terms as strings.
        """
        vectorizer = TfidfVectorizer(sublinear_tf=True)
        # `tweet or ""` tolerates None entries (e.g. empty XML elements).
        X_train = vectorizer.fit_transform(
            [
                " ".join(tweet or "" for tweet in user_tweets)
                for user_tweets in df_train.tweets
            ]
        )

        # get_feature_names() was removed in scikit-learn 1.2; prefer its
        # replacement and fall back only on releases that predate it.
        if hasattr(vectorizer, "get_feature_names_out"):
            feature_names = vectorizer.get_feature_names_out()
        else:
            feature_names = vectorizer.get_feature_names()

        # Asking for more features than exist raises or warns depending on
        # the scikit-learn version, so clamp k to the vocabulary size.
        ch2 = SelectKBest(chi2, k=min(N, X_train.shape[1]))
        ch2.fit(X_train, df_train.target)

        # dtype=str keeps the result a unicode array whichever API was used.
        keep_words = np.asarray(feature_names, dtype=str)[
            ch2.get_support(indices=True)
        ]
        return keep_words

    def transform(self, df, keepn_tweets=300):
        """Attach each row's top-scoring tweets and their scores to ``df``.

        Adds two columns to ``df`` **in place** and returns the same object:
        ``TopN_Tweets`` (array of the selected tweets) and ``Tweet_Scores``
        (list of their scores, in the same order). Requires ``fit`` to have
        been called so that ``self.keep_words`` exists.

        Parameters
        ----------
        df : pandas.DataFrame
            Must have a ``tweets`` column (one sequence of tweets per row).
        keepn_tweets : int, default 300
            Maximum number of tweets to keep per row.
        """
        selected_tweets, scores = self.select_topn_tweets(
            df.tweets, self.keep_words, keep_topn=keepn_tweets
        )

        df["TopN_Tweets"] = selected_tweets
        df["Tweet_Scores"] = scores

        return df

    def select_topn_tweets(self, tweet_list, keep_words, keep_topn):
        """Rank every user's tweets by important-word count and keep the top N.

        A tweet's score is the number of its whitespace-separated tokens
        whose lower-cased form is in ``keep_words``. ``None`` tweets score 0.

        Parameters
        ----------
        tweet_list : sequence of sequences of str
            One inner sequence of tweets per user.
        keep_words : iterable of str
            The important-word vocabulary.
        keep_topn : int
            Maximum number of tweets to keep per user.

        Returns
        -------
        (list of numpy.ndarray, list of list of int)
            Per user, the selected tweets and their scores, both ordered by
            descending score with ties kept in original tweet order.
        """
        # Set membership is O(1); testing `in` on the numpy array would scan
        # the whole vocabulary for every token.
        vocabulary = set(map(str, keep_words))

        temp_tweet_list = []
        temp_scores_list = []

        for user_tweets in tqdm(tweet_list, total=len(tweet_list)):
            tweet_scores = [
                sum(word.lower() in vocabulary for word in (tweet or "").split())
                for tweet in user_tweets
            ]

            # sorted() is stable even with reverse=True, so equally scored
            # tweets keep their original relative order.
            selected_tweet_idx = sorted(
                range(len(tweet_scores)),
                key=tweet_scores.__getitem__,
                reverse=True,
            )[:keep_topn]

            temp_tweet_list.append(np.array(user_tweets)[selected_tweet_idx])
            temp_scores_list.append([tweet_scores[i] for i in selected_tweet_idx])

        return temp_tweet_list, temp_scores_list
    

In [None]:
# Selector that will learn the important-word list and pick top tweets per user.
ts = Tweet_Selection()

In [None]:
# Learn the important-word list from the training split only.
ts.fit(train_df)

# Apply the same selection to both splits, keeping at most 30 tweets per user.
train_df, test_df = (
    ts.transform(split, keepn_tweets=30) for split in (train_df, test_df)
)

In [None]:
# Stack the transformed train and test rows into one frame (row-wise concat;
# the original index labels are kept).
full_df = pd.concat([train_df, test_df], axis = 0)

In [None]:
# Persist the processed splits. Create the output directory first so the
# writes do not fail on a fresh checkout where it does not exist yet.
Path("./data/preprocessed").mkdir(parents=True, exist_ok=True)
train_df.to_pickle("./data/preprocessed/train_df.pkl")
test_df.to_pickle("./data/preprocessed/test_df.pkl")
full_df.to_pickle("./data/preprocessed/full_df.pkl")