<a href="https://colab.research.google.com/github/BHatiru/NLP_Project/blob/main/Trees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import glob
import torch
import pandas as pd

In [None]:
# Disaster events covered by the dataset. Every event has one TSV per split,
# stored as '<split>/<event>_<split>.tsv'.
EVENTS = [
    'california_wildfires_2018',
    'canada_wildfires_2016',
    'cyclone_idai_2019',
    'ecuador_earthquake_2016',
    'greece_wildfires_2018',
    'hurricane_dorian_2019',
    'hurricane_florence_2018',
    'hurricane_harvey_2017',
    'hurricane_irma_2017',
    'hurricane_maria_2017',
    'hurricane_matthew_2016',
    'italy_earthquake_aug_2016',
    'kaikoura_earthquake_2016',
    'kerala_floods_2018',
    'maryland_floods_2018',
    'midwestern_us_floods_2019',
    'pakistan_earthquake_2019',
    'puebla_mexico_earthquake_2017',
    'srilanka_floods_2017',
]


def expand_split_paths(splits, events=EVENTS):
    """Build the relative TSV path of every event for each requested split.

    Paths are ordered split by split (all events of the first split, then all
    events of the second, ...), keeping the event order of ``events``.
    """
    return [f'{split}/{event}_{split}.tsv' for split in splits for event in events]


def existing_files(paths):
    """Return the entries of ``paths`` that glob finds on disk, in order.

    Paths without a match are not returned; they are reported with a printed
    notice so that a wrong working directory or an incomplete download does
    not silently shrink the dataset.
    """
    found = []
    missing = []
    for path in paths:
        matches = glob.glob(path)
        if matches:
            found.extend(matches)
        else:
            missing.append(path)
    if missing:
        print(f"Warning: {len(missing)} expected file(s) not found: {missing}")
    return found


# The dev split is merged into the training data; the test split stays separate.
train_paths = expand_split_paths(['train', 'dev'])
test_paths = expand_split_paths(['test'])

train_file_paths = existing_files(train_paths)
test_file_paths = existing_files(test_paths)

In [None]:
def load_split(file_paths, excluded_label='missing_or_found_people'):
    """Read tab-separated files and stack them into a single DataFrame.

    For every file the first column is dropped (the tweet ids, per the
    original notebook) and rows whose ``class_label`` equals
    ``excluded_label`` are removed.

    Parameters
    ----------
    file_paths : list of str
        Paths of the TSV files to read.
    excluded_label : str
        Value of ``class_label`` whose rows are discarded.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all filtered frames with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``file_paths`` is empty (nothing could be concatenated).
    """
    if not file_paths:
        raise ValueError(
            "No TSV files to load; check that the data directories exist "
            "relative to the current working directory."
        )

    frames = []
    for file in file_paths:
        frame = pd.read_csv(file, sep='\t')
        frame = frame.iloc[:, 1:]  # removing tweet ids (first column)
        frame = frame[frame['class_label'] != excluded_label]
        frames.append(frame)
    return pd.concat(frames, ignore_index=True)


train_df = load_split(train_file_paths)
test_df = load_split(test_file_paths)

In [None]:
def reduce_class_size(df, max_samples_per_class=3000, random_state=42):
    """Cap the number of rows per ``class_label`` by random down-sampling.

    Classes with at most ``max_samples_per_class`` rows keep all of their rows
    (in shuffled order); larger classes are sampled without replacement down
    to exactly ``max_samples_per_class`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``class_label`` column.
    max_samples_per_class : int
        Upper bound on the number of rows kept for each class.
    random_state : int or None
        Seed passed to ``DataFrame.sample`` for every class so the result is
        reproducible across runs. Pass ``None`` for unseeded sampling.

    Returns
    -------
    pandas.DataFrame
        Sampled rows, grouped class by class in sorted label order, with a
        fresh 0..n-1 index and the same columns as ``df``.
    """
    # Sampling each group explicitly avoids DataFrameGroupBy.apply, which
    # warns about operating on the grouping column in recent pandas releases.
    sampled = [
        group.sample(n=min(len(group), max_samples_per_class), random_state=random_state)
        for _, group in df.groupby('class_label')
    ]
    if not sampled:
        # No groups (e.g. empty input): return an empty frame with the same columns.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(sampled, ignore_index=True)

# Down-sample over-represented classes in the training data (default cap per class).
train_df = train_df.pipe(reduce_class_size)

In [None]:
# Distinct class labels over both splits, in order of first appearance
# (training rows first, then test rows).
combined_labels = pd.concat([train_df['class_label'], test_df['class_label']])
all_labels = combined_labels.unique()

# Assign each label the integer of its position in all_labels.
label_to_num = dict(zip(all_labels, range(len(all_labels))))

# Attach the integer code as a new column on both frames.
for frame in (train_df, test_df):
    frame['class_label_num'] = frame['class_label'].map(label_to_num)

In [None]:
# Pull the raw tweet text and the integer class codes out of each frame as
# plain Python lists, the form handed to the scikit-learn pipeline below.
train_tweets, train_labels = (list(train_df[col]) for col in ('tweet_text', 'class_label_num'))
test_tweets, test_labels = (list(test_df[col]) for col in ('tweet_text', 'class_label_num'))

In [None]:
# Stand-alone TF-IDF vectorizer limited to 1000 features with English stop words removed.
# NOTE(review): the pipeline in the following cell builds its own vectorizer
# (max_features=5000) and does not reference this object.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
)

In [None]:
# Text -> TF-IDF features (unigrams only, English stop words removed,
# at most 5000 terms).
tfidf_step = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
    ngram_range=(1, 1),
    use_idf=True,
)

# Random forest classifier; fixed seed for reproducible trees.
# max_depth is the value tuned by the grid search defined further down.
rf_step = RandomForestClassifier(
    random_state=42,
    n_estimators=200,
    max_depth=30,
    min_samples_split=5,
    min_samples_leaf=2,
)

pipeline = Pipeline(steps=[('tfidf', tfidf_step), ('rf', rf_step)])

In [None]:
# Candidate tree depths for the 'rf' step of the pipeline
# ('<step>__<parameter>' naming): 200, 300, 400 and 500.
param_grid = {'rf__max_depth': list(range(200, 501, 100))}

In [None]:
# Exhaustive search over param_grid with 3-fold cross-validation;
# n_jobs=-1 runs the fits in parallel, verbose=2 logs progress.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the cross-validated search on the training tweets and their integer labels.
grid_search.fit(X=train_tweets, y=train_labels)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [None]:
# Show the winning hyper-parameters and their mean cross-validated score.
search_summary = (
    ("Best Parameters:", grid_search.best_params_),
    ("Best Score:", grid_search.best_score_),
)
for caption, value in search_summary:
    print(caption, value)

Best Parameters: {'rf__max_depth': 300}
Best Score: 0.7017664865693375


In [None]:
# Predict the held-out test tweets with the fitted search object and report accuracy.
test_preds = grid_search.predict(test_tweets)
test_accuracy = accuracy_score(test_labels, test_preds)
print(test_accuracy)

0.6835233297985154


In [None]:
# Accuracy on the training tweets themselves, for comparison with the test accuracy.
trains_preds = grid_search.predict(train_tweets)
train_accuracy = accuracy_score(train_labels, trains_preds)
print(train_accuracy)

0.8398283327585546


In [None]:
# Per-class precision / recall / F1 on the test split. The rows are labelled
# with the original class names instead of the integer codes, so the report can
# be read without looking up label_to_num. label_to_num preserves insertion
# order, so its keys and values line up one-to-one.
report_labels = list(label_to_num.values())
report_names = [str(name) for name in label_to_num]
print(classification_report(
    test_labels,
    test_preds,
    labels=report_labels,
    target_names=report_names,
))

              precision    recall  f1-score   support

           0       0.54      0.68      0.60      1070
           1       0.73      0.92      0.81       790
           2       0.73      0.82      0.77      1617
           3       0.83      0.93      0.88      1447
           4       0.45      0.52      0.48      1245
           5       0.57      0.33      0.41      2407
           6       0.35      0.71      0.47       521
           7       0.83      0.73      0.78      4219
           8       0.76      0.73      0.74      1772

    accuracy                           0.68     15088
   macro avg       0.64      0.71      0.66     15088
weighted avg       0.70      0.68      0.68     15088

