## 1. Importing the libraries

<p style="font-size:18px;">
    To utilise the pre-written code and functionality the libraries given below are imported. 
</p>

In [1]:
import numpy as np
import pandas as pd
import re
import emoji
import inflect
from tqdm import tqdm
import copy
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
import pickle
from sentence_transformers import SentenceTransformer,util
from sentence_transformers import SentencesDataset, LoggingHandler, losses
from sklearn.metrics import classification_report


  from .autonotebook import tqdm as notebook_tqdm


## 2. Importing the dataset

In [2]:
data = pd.read_csv('/Users/savinaysingh/Downloads/tweets 2.csv')

In [3]:
data = data[['text','target']]

## 3. Data Preprocessing

In [4]:
# Converting the words having apostophe into their root form
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"there\'s", "there is", phrase)
    phrase = re.sub(r"it\'s", "it is", phrase)
    phrase = re.sub(r"he\'s", "he is", phrase)
    phrase = re.sub(r"she\'s", "she is", phrase)
    phrase = re.sub(r"how\'s", "how is", phrase)
    phrase = re.sub(r"let\'s", "let is", phrase)
    phrase = re.sub(r"so\'s", "so is", phrase)
    phrase = re.sub(r"what\'s", "what is", phrase)
    phrase = re.sub(r"when\'s", "when is", phrase)
    phrase = re.sub(r"where\'s", "where is", phrase)
    phrase = re.sub(r"why\'s", "why is", phrase)
  
    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [5]:
# De-emojize the text
def remove_emojis(text):
    return emoji.demojize(text)

# url not req while finding if text is disaster or not
def remove_urls(text):
    pattern = re.compile(r'http\S+|www\S+')
    return pattern.sub('', text)

def remove_hashtags(text):
    pattern = re.compile(r'#\w+')
    return pattern.sub('', text)

# spl chars not req while finding if text is disaster or not
def remove_special_characters(text):
    pattern = re.compile(r'[^a-zA-Z0-9\s]')
    return pattern.sub('', text)

In [6]:
def preprocess_text(text):
    text = remove_urls(text)
    text = remove_hashtags(text)
    text = remove_special_characters(text)
    text = remove_emojis(text)
    return text

In [7]:
def removeDash(input_string):
        return re.sub(r'-', ' ', input_string)

In [8]:
def lemmatize_text(preprocessed_text):
    def get_wordnet_pos(word):
        """Map POS tag to first character lemmatize() accepts"""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        # print(tag)
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
        return tag_dict.get(tag, wordnet.NOUN)
    # 1. Init Lemmatizer
    lemmatizer = WordNetLemmatizer()
   # 2. Lemmatize a Sentence with the appropriate POS tag
    lemmatized_text=[lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(preprocessed_text)]
    # The final processed set of words for each iteration will be stored in 'text_final'
    text_final=(" ".join( lemmatized_text ))
    return text_final

In [9]:
smote = SMOTE(sampling_strategy='auto', random_state=42)

In [10]:
def preprocess_data(data):
    """
    Preprocesses a DataFrame containing text data by applying various cleaning and text processing steps.
    
    Args:
    data (DataFrame): The input DataFrame containing text data.
    
    Returns:
    DataFrame: The preprocessed DataFrame with cleaned text data.
    """

    print("1. Capturing Hashtags")
    # 1. Capturing Hashtags
    pattern = r'#\w+'
    data['hashtags'] = data['text'].str.findall(pattern)
    data.hashtags=['' if [] == d else ' '.join([remove_special_characters(w) for w in d]) for d in data.hashtags]
    # 2. Handling Apostrophes
    print("2. Handling Apostrophes")
    sentences = data['text']
    number_sent_apostophe=len([x for x in sentences if "'" in x])
    print("Percentage of sentences with apostophe is (before handling): ",number_sent_apostophe/len(sentences)*100,"%")
    sentences = [decontracted(x) for x in sentences]
    data['text'] = sentences
    number_sent_apostophe=len([x for x in sentences if "'" in x])
    print("Percentage of sentences with apostophe is (after handling): ",number_sent_apostophe/len(sentences)*100,"%")

    # 3. Lowercasing
    print("3. Lowercasing the sentences")
    data['text'] = [entry.lower() for entry in sentences]

    # 4. Removing Emojis, URLs, Hashtags, and Special Characters
    print("4. Removing Emojis, URLs, Hashtags, and Special Characters")
    data['text'] = data['text'].apply(preprocess_text)
    
    
    # 5. Handling Numerics
    print("5. Handling Numerics")
    p = inflect.engine()
    sentences = list(data.text)
    data.text = sentences
    final_ans = []
    for i, sent in tqdm(enumerate(sentences), total=len(sentences)):
        list_numbers = re.findall(r'[0-9]+', sent)
        list_numbers = [int(l) for l in list_numbers]
        list_words = [p.number_to_words(l) for l in list_numbers]
        dict_NW = dict(zip(list_numbers, list_words))
        myKeys = list(dict_NW.keys())
        myKeys.sort(reverse=True)
        sorted_dict = {i: dict_NW[i] for i in myKeys}
        ans = copy.copy(sent)
        for key in sorted_dict.keys():
            ans = re.sub(str(key), ' ' + sorted_dict[key] + ' ', ans)
        final_ans.append(ans)
    sentences = final_ans.copy()
    data.text = sentences
    data['text'] = data['text'].apply(removeDash)

    # 6. Removing Duplicate Tweets
    print("6. Removing Duplicate Tweets")
    data = data.drop_duplicates(subset=['text'])
    sentences=data.text
    # 7. Lemmatization
    print("7. Performing Lemmatization")
    ans = []
    for x in tqdm(sentences, total=len(sentences)):
        ans.append(lemmatize_text(x))
    sentences = ans.copy()
    data.text = sentences

    # 8. Stopwords Removal
    print("8. Removing Stopwords")
    stop_words = stopwords.words('english')
    sentences = [' '.join([word for word in x.split() if word not in stop_words]) for x in sentences]
    data.text = sentences
    
    print("Done")
    return data

In [11]:
data=preprocess_data(data)

1. Capturing Hashtags
2. Handling Apostrophes
Percentage of sentences with apostophe is (before handling):  16.710642040457344 %
Percentage of sentences with apostophe is (after handling):  8.988566402814424 %
3. Lowercasing the sentences
4. Removing Emojis, URLs, Hashtags, and Special Characters
5. Handling Numerics


100%|██████████████████████████████████| 11370/11370 [00:00<00:00, 82372.94it/s]


6. Removing Duplicate Tweets
7. Performing Lemmatization


100%|████████████████████████████████████| 10938/10938 [00:18<00:00, 599.72it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.text = sentences


8. Removing Stopwords
Done


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.text = sentences


## 4. Train-Test Split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(data[['text','hashtags']], data.target, test_size=0.2, random_state=42)

## 5. Vectorization

### >Vectorization for Statistical

In [13]:
vectorizer = TfidfVectorizer(ngram_range=(1,3))
vectorizer2 = TfidfVectorizer(ngram_range=(1,3))

In [14]:
matrix = vectorizer.fit_transform(X_train['text'])
countv_train=pd.DataFrame(matrix.toarray(),columns=vectorizer.get_feature_names_out())
matrix2 = vectorizer2.fit_transform(X_train['hashtags'])
countv_train2=pd.DataFrame(matrix2.toarray(),columns=vectorizer2.get_feature_names_out())

In [15]:
countv_train_3=pd.concat([countv_train, countv_train2], axis=1)

In [58]:
# Testing Set
matrix = vectorizer.transform(X_test['text'])
countv_test=pd.DataFrame(matrix.toarray(),columns=vectorizer.get_feature_names_out())
matrix2 = vectorizer2.transform(X_test['hashtags'])
countv_test2=pd.DataFrame(matrix2.toarray(),columns=vectorizer2.get_feature_names_out())
countv_test_3=pd.concat([countv_test, countv_test2], axis=1)

In [17]:
X_train_TF=countv_train_3
X_test_TF=countv_test_3
y_train_TF, y_test_TF=y_train, y_test

### >Vectorization for Contextual

In [18]:
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [50]:
X_train_ST=model.encode(list(X_train.text))

In [51]:
X_test_ST=model.encode(list(X_test.text))

In [52]:
X_train_ST_x=np.concatenate((X_train_ST, countv_train2), axis=1)

In [61]:
X_test_ST_x=np.concatenate((X_test_ST, countv_test2), axis=1)

## 6. Oversampling

In [64]:
X_train_ST_x_sm, y_train_ST_x_sm = smote.fit_resample(X_train_ST_x, y_train)

## 7. Modelling

In [65]:
rf_classifier=RandomForestClassifier(n_estimators=100, random_state=42,class_weight={0:5,1:1})

In [66]:
rf_classifier.fit(X_train_ST_x_sm, y_train_ST_x_sm)

In [67]:
y_pred=rf_classifier.predict(X_test_ST_x)

In [61]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.91      0.92      1794
           1       0.63      0.73      0.68       398

    accuracy                           0.87      2192
   macro avg       0.79      0.82      0.80      2192
weighted avg       0.88      0.87      0.88      2192



## 8. Filtering out tweets in Harvey Dataset

In [35]:
df_harvey=pd.read_csv('/Users/savinaysingh/Downloads/Hurricane_Harvey.csv', header= 0,
                        encoding= 'unicode_escape')

In [36]:
df_harvey['text']=df_harvey['Tweet']

In [37]:
df_harvey=df_harvey[~df_harvey['Tweet'].isna()]

In [38]:
df_harvey=preprocess_data(df_harvey)

1. Capturing Hashtags
2. Handling Apostrophes
Percentage of sentences with apostophe is (before handling):  17.777928185382386 %
Percentage of sentences with apostophe is (after handling):  10.42575379277843 %
3. Lowercasing the sentences
4. Removing Emojis, URLs, Hashtags, and Special Characters
5. Handling Numerics


100%|████████████████████████████████| 398916/398916 [00:05<00:00, 76203.48it/s]


6. Removing Duplicate Tweets
7. Performing Lemmatization


100%|██████████████████████████████████| 258806/258806 [05:50<00:00, 738.16it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.text = sentences


8. Removing Stopwords
Done


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.text = sentences


In [39]:
encoding=model.encode(list(df_harvey[0:50000].text))

In [44]:
encoding.shape

(50000, 768)

In [69]:
matrix2 =  vectorizer2.transform(df_harvey[0:50000]['hashtags'])
countv_test2=pd.DataFrame(matrix2.toarray(),columns=vectorizer2.get_feature_names_out())

In [70]:
countv_test2.shape

(50000, 2427)

In [71]:
df_X=np.concatenate((encoding, countv_test2), axis=1)

In [72]:
y_pred=rf_classifier.predict(df_X)

In [75]:
pd.Series(y_pred).value_counts()

0    29678
1    20322
Name: count, dtype: int64

In [80]:
df_harvey=df_harvey[0:50000]

In [81]:
df_harvey['IsDisaster']=y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_harvey['IsDisaster']=y_pred


In [82]:
df_harvey.to_csv('/Users/savinaysingh/Downloads/harvey_emotion_final_2.csv')