# Twitter Disaster Test

In [1]:
from os import path
import pandas as pd

DATASET_FOLDER = "../data/nlp-getting-started"
TRAIN_FILE = path.join(DATASET_FOLDER, "train.csv") 

In [2]:
# read data
df_tweets = pd.read_csv(TRAIN_FILE)

In [3]:
# split data into train and test
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df_tweets, test_size=0.2, random_state=42)

## Data Cleaning

In [4]:
'''
Data cleaning
'''
def data_cleaning_pipeline(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.keyword.fillna("", inplace=True)
    df.location.fillna("", inplace=True)
    return df

# Data Preprocessing

In [5]:
'''
Data preprocessing
'''
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

def encode_location(df: pd.DataFrame) -> None:
    le = LabelEncoder()
    df["location"] = le.fit_transform(df["location"])

def data_preprocessing_pipeline(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.text.apply(lambda x: x.lower())
    df.keyword.apply(lambda x: x.lower())
    df.location.apply(lambda x: x.lower())
    encode_location(df)
    return df

In [6]:
'''
Data vectorization
'''
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

def data_vectorization_pipeline(df: pd.DataFrame) -> pd.DataFrame:
    df_vec = df.copy()
    stop_words = stopwords.words('english')
    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=1000)
    X = vectorizer.fit_transform(df_vec.text)
    df_features = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
    df_features["target"] = df["target"]
    df_features["location"] = df["location"]
        
    return df_features

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/guilhermeleonardonunes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data Preparation Pipeline

In [7]:
'''
Data preparation pipeline

apply all the above pipelines
'''
def data_preparation_pipeline(df: pd.DataFrame) -> pd.DataFrame:
    df = data_cleaning_pipeline(df)
    display(df.target.isna().sum())
    df = data_preprocessing_pipeline(df)
    display(df.target.isna().sum())
    df = data_vectorization_pipeline(df)
    display(df.target.isna().sum())
    return df

## Feature Selection Pipeline

In [8]:
def get_most_correlated_features(df: pd.DataFrame, correlation_value: float = 0.1) -> pd.Series:
    # select most correlated features
    df_corr = df.corr()
    df_corr_target = df_corr["target"].abs()
    df_corr_target = df_corr[df_corr_target > correlation_value]
    return df_corr_target

## Apply LDA

In [9]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


## Training and Prediction

In [10]:
# prepare data for training
df_train = data_preparation_pipeline(df_train)

0

0

1222