<h1 style="text-align: center"> This notebook provides a comparison between two distinct approaches: Contextual Analysis using Sentence Transformers and Statistical Analysis using TF-IDF. The objective is to determine the most suitable method for predicting whether a given tweet represents a disaster or not.</h1>

In [99]:
import numpy as np
import pandas as pd
import re
import emoji
import inflect
from tqdm import tqdm
import copy
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
import pickle
from sentence_transformers import SentenceTransformer,util
from sentence_transformers import SentencesDataset, LoggingHandler, losses
from sklearn.metrics import classification_report


In [100]:
# Converting the words having apostophe into their root form
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"there\'s", "there is", phrase)
    phrase = re.sub(r"it\'s", "it is", phrase)
    phrase = re.sub(r"he\'s", "he is", phrase)
    phrase = re.sub(r"she\'s", "she is", phrase)
    phrase = re.sub(r"how\'s", "how is", phrase)
    phrase = re.sub(r"let\'s", "let is", phrase)
    phrase = re.sub(r"so\'s", "so is", phrase)
    phrase = re.sub(r"what\'s", "what is", phrase)
    phrase = re.sub(r"when\'s", "when is", phrase)
    phrase = re.sub(r"where\'s", "where is", phrase)
    phrase = re.sub(r"why\'s", "why is", phrase)
  
    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [101]:
# De-emojize the text
def remove_emojis(text):
    return emoji.demojize(text)

# url not req while finding if text is disaster or not
def remove_urls(text):
    pattern = re.compile(r'http\S+|www\S+')
    return pattern.sub('', text)

def remove_hashtags(text):
    pattern = re.compile(r'#\w+')
    return pattern.sub('', text)

# spl chars not req while finding if text is disaster or not
def remove_special_characters(text):
    pattern = re.compile(r'[^a-zA-Z0-9\s]')
    return pattern.sub('', text)

In [102]:
def preprocess_text(text):
    text = remove_urls(text)
    text = remove_hashtags(text)
    text = remove_special_characters(text)
    text = remove_emojis(text)
    return text

In [103]:
def removeDash(input_string):
        return re.sub(r'-', ' ', input_string)

In [104]:
def lemmatize_text(preprocessed_text):
    def get_wordnet_pos(word):
        """Map POS tag to first character lemmatize() accepts"""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        # print(tag)
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
        return tag_dict.get(tag, wordnet.NOUN)
    # 1. Init Lemmatizer
    lemmatizer = WordNetLemmatizer()
   # 2. Lemmatize a Sentence with the appropriate POS tag
    lemmatized_text=[lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(preprocessed_text)]
    # The final processed set of words for each iteration will be stored in 'text_final'
    text_final=(" ".join( lemmatized_text ))
    return text_final

In [105]:
smote = SMOTE(sampling_strategy='auto', random_state=42)

# 1. Data Cleaning for both

In [106]:
data = pd.read_csv('/Users/savinaysingh/Downloads/tweets 2.csv')


In [107]:
data = data[['text','target']]

In [10]:
def preprocess_data(data):
    """
    Preprocesses a DataFrame containing text data by applying various cleaning and text processing steps.
    
    Args:
    data (DataFrame): The input DataFrame containing text data.
    
    Returns:
    DataFrame: The preprocessed DataFrame with cleaned text data.
    """

    print("1. Capturing Hashtags")
    # 1. Capturing Hashtags
    pattern = r'#\w+'
    data['hashtags'] = data['text'].str.findall(pattern)
    data.hashtags=['' if [] == d else ' '.join([remove_special_characters(w) for w in d]) for d in data.hashtags]
    # 2. Handling Apostrophes
    print("2. Handling Apostrophes")
    sentences = data['text']
    number_sent_apostophe=len([x for x in sentences if "'" in x])
    print("Percentage of sentences with apostophe is (before handling): ",number_sent_apostophe/len(sentences)*100,"%")
    sentences = [decontracted(x) for x in sentences]
    data['text'] = sentences
    number_sent_apostophe=len([x for x in sentences if "'" in x])
    print("Percentage of sentences with apostophe is (after handling): ",number_sent_apostophe/len(sentences)*100,"%")

    # 3. Lowercasing
    print("3. Lowercasing the sentences")
    data['text'] = [entry.lower() for entry in sentences]

    # 4. Removing Emojis, URLs, Hashtags, and Special Characters
    print("4. Removing Emojis, URLs, Hashtags, and Special Characters")
    data['text'] = data['text'].apply(preprocess_text)
    
    
    # 5. Handling Numerics
    print("5. Handling Numerics")
    p = inflect.engine()
    sentences = list(data.text)
    data.text = sentences
    final_ans = []
    for i, sent in tqdm(enumerate(sentences), total=len(sentences)):
        list_numbers = re.findall(r'[0-9]+', sent)
        list_numbers = [int(l) for l in list_numbers]
        list_words = [p.number_to_words(l) for l in list_numbers]
        dict_NW = dict(zip(list_numbers, list_words))
        myKeys = list(dict_NW.keys())
        myKeys.sort(reverse=True)
        sorted_dict = {i: dict_NW[i] for i in myKeys}
        ans = copy.copy(sent)
        for key in sorted_dict.keys():
            ans = re.sub(str(key), ' ' + sorted_dict[key] + ' ', ans)
        final_ans.append(ans)
    sentences = final_ans.copy()
    data.text = sentences
    data['text'] = data['text'].apply(removeDash)

    # 6. Removing Duplicate Tweets
    print("6. Removing Duplicate Tweets")
    data = data.drop_duplicates(subset=['text'])
    sentences=data.text
    # 7. Lemmatization
    print("7. Performing Lemmatization")
    ans = []
    for x in tqdm(sentences, total=len(sentences)):
        ans.append(lemmatize_text(x))
    sentences = ans.copy()
    data.text = sentences

    # 8. Stopwords Removal
    print("8. Removing Stopwords")
    stop_words = stopwords.words('english')
    sentences = [' '.join([word for word in x.split() if word not in stop_words]) for x in sentences]
    data.text = sentences
    
    print("Done")
    return data

In [11]:
data=preprocess_data(data)

1. Capturing Hashtags
2. Handling Apostrophes
Percentage of sentences with apostophe is (before handling):  16.710642040457344 %
Percentage of sentences with apostophe is (after handling):  8.988566402814424 %
3. Lowercasing the sentences
4. Removing Emojis, URLs, Hashtags, and Special Characters
5. Handling Numerics


100%|██████████████████████████████████| 11370/11370 [00:00<00:00, 85656.00it/s]


6. Removing Duplicate Tweets
7. Performing Lemmatization


100%|████████████████████████████████████| 10938/10938 [00:17<00:00, 624.05it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.text = sentences


8. Removing Stopwords
Done


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.text = sentences


# 2. Division B/w Test and train

In [12]:
X_train, X_test, y_train, y_test = train_test_split(data[['text','hashtags']], data.target, test_size=0.2, random_state=42)

# 3.1. Vectorization for Statistical

In [13]:
vectorizer = TfidfVectorizer(ngram_range=(1,3))
vectorizer2 = TfidfVectorizer(ngram_range=(1,3))

In [14]:
matrix = vectorizer.fit_transform(X_train['text'])
countv_train=pd.DataFrame(matrix.toarray(),columns=vectorizer.get_feature_names_out())
matrix2 = vectorizer2.fit_transform(X_train['hashtags'])
countv_train2=pd.DataFrame(matrix2.toarray(),columns=vectorizer2.get_feature_names_out())

In [15]:
countv_train_3=pd.concat([countv_train, countv_train2], axis=1)

In [16]:
# Testing Set
matrix = vectorizer.transform(X_test['text'])
countv_test=pd.DataFrame(matrix.toarray(),columns=vectorizer.get_feature_names_out())
matrix2 = vectorizer2.transform(X_test['hashtags'])
countv_test2=pd.DataFrame(matrix2.toarray(),columns=vectorizer2.get_feature_names_out())
countv_test_3=pd.concat([countv_test, countv_test2], axis=1)

In [17]:
X_train_TF=countv_train_3
X_test_TF=countv_test_3
y_train_TF, y_test_TF=y_train, y_test

# 3.2. Vectorization for Contextual

In [18]:
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [30]:
X_train_ST=model.encode(list(X_train.text))

In [31]:
X_test_ST=model.encode(list(X_test.text))

In [32]:
X_train_ST_hash=model.encode(list(X_train.hashtags))
X_test_ST_hash=model.encode(list(X_test.hashtags))

In [33]:
X_train_ST = np.concatenate((X_train_ST, X_train_ST_hash), axis=1)
X_test_ST = np.concatenate((X_test_ST, X_test_ST_hash), axis=1)

# 4. Oversampling

In [20]:
X_train_TF_sm, y_train_tf_sm = smote.fit_resample(X_train_TF, y_train)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [34]:
X_train_ST_sm, y_train_st_sm = smote.fit_resample(X_train_ST, y_train)

# 5. Random Forest Classifier

## 5.1. Base

In [35]:
rf_classifier=RandomForestClassifier(n_estimators=133,min_samples_split=30,max_depth=140,random_state=15)

In [22]:
rf_classifier.fit(X_train_TF_sm, y_train_tf_sm)

In [23]:
y_pred=rf_classifier.predict(X_test_TF)

In [24]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.98      0.92      1774
           1       0.81      0.39      0.52       414

    accuracy                           0.87      2188
   macro avg       0.84      0.68      0.72      2188
weighted avg       0.86      0.87      0.85      2188



In [36]:
rf_classifier.fit(X_train_ST_sm, y_train_st_sm)

In [37]:
y_pred=rf_classifier.predict(X_test_ST)

In [38]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.94      0.92      1774
           1       0.70      0.58      0.64       414

    accuracy                           0.87      2188
   macro avg       0.80      0.76      0.78      2188
weighted avg       0.87      0.87      0.87      2188



## 5.2. Enhanced

In [26]:
rf_classifier=RandomForestClassifier(n_estimators=133,min_samples_split=30,max_depth=140,random_state=15,class_weight={0:1,1:3})

In [27]:
rf_classifier.fit(X_train_TF_sm, y_train_tf_sm)

In [28]:
y_pred=rf_classifier.predict(X_test_TF)

In [29]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.82      0.87      1774
           1       0.48      0.74      0.59       414

    accuracy                           0.80      2188
   macro avg       0.71      0.78      0.73      2188
weighted avg       0.85      0.80      0.82      2188



In [86]:
rf_classifier=RandomForestClassifier(n_estimators=133,min_samples_split=30,max_depth=140,random_state=15,class_weight={0:1,1:5})

In [87]:
rf_classifier.fit(X_train_ST_sm, y_train_st_sm)

In [88]:
X_train_ST_sm.shape

(14434, 1536)

In [89]:
X_test_ST.shape

(2188, 1536)

In [90]:
y_pred=rf_classifier.predict(X_test_ST)

In [91]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.93      0.92      1774
           1       0.66      0.57      0.61       414

    accuracy                           0.86      2188
   macro avg       0.78      0.75      0.76      2188
weighted avg       0.86      0.86      0.86      2188



In [92]:
rf_classifier=RandomForestClassifier(n_estimators=100, random_state=42,class_weight={0:5,1:1})

In [93]:
rf_classifier.fit(X_train_ST_sm, y_train_st_sm)

In [94]:
X_train_ST_sm.shape

(14434, 1536)

In [95]:
X_test_ST.shape

(2188, 1536)

In [96]:
y_pred=rf_classifier.predict(X_test_ST)

In [97]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.90      0.91      1774
           1       0.62      0.71      0.66       414

    accuracy                           0.86      2188
   macro avg       0.78      0.80      0.79      2188
weighted avg       0.87      0.86      0.87      2188



In [98]:
with open('/Users/savinaysingh/Documents/SavinayUTS/iLab/Code/rf_classifier_sent.pkl', 'wb') as file:
    pickle.dump(rf_classifier, file)

In [None]:
# Save the model 

In [None]:
# without hashtag seperation

In [110]:
[t for t in data.text if '#' in t] 

['Images showing the havoc caused by the #Cameroon military as they torched houses in #Oku.The shameless military is reported…',
 'Social media went bananas after Chuba Hubbard announced Monday evening his plans to return to #okstate. https://t.co/peN…',
 'Under #MamataBanerjee political violence &amp; vandalism continues to unabated in West Bengal! office in Asanol was…',
 'Images showing the havoc caused by the #Cameroon military as they torched houses in #Oku.The shameless military is… https://t.co/gIwZCH533D',
 'No cows today but our local factory is sadly still ablaze #REDJanuary2020 https://t.co/CMyuKzrcKz',
 'Rengoku sets my heart ablaze😔❤️🔥 P.s. I missed this style of coloring I do so here it is c: #鬼滅の刃 https://t.co/YrUF9g68s0',
 'French cameroun set houses ablaze in Ndu and roasted two young boys in their homes in #targeted killings in a #GenocideInSou…',
 "Cameroon's #BIR soldiers on the 05/01/2020 invaded the #SouthernCameroons Village of Kimar - So setting ablaze a total o

In [111]:
# -> just remove from text

'Images showing the havoc caused by the  military as they torched houses in .The shameless military is reported…'

In [10]:
def preprocess_data(data):
    """
    Preprocesses a DataFrame containing text data by applying various cleaning and text processing steps.
    
    Args:
    data (DataFrame): The input DataFrame containing text data.
    
    Returns:
    DataFrame: The preprocessed DataFrame with cleaned text data.
    """

    print("1. Capturing Hashtags")
    # 1. Capturing Hashtags
    pattern = r'#\w+'
    data['hashtags'] = data['text'].str.findall(pattern)
    data.hashtags=['' if [] == d else ' '.join([remove_special_characters(w) for w in d]) for d in data.hashtags]
    # 2. Handling Apostrophes
    print("2. Handling Apostrophes")
    sentences = data['text']
    number_sent_apostophe=len([x for x in sentences if "'" in x])
    print("Percentage of sentences with apostophe is (before handling): ",number_sent_apostophe/len(sentences)*100,"%")
    sentences = [decontracted(x) for x in sentences]
    data['text'] = sentences
    number_sent_apostophe=len([x for x in sentences if "'" in x])
    print("Percentage of sentences with apostophe is (after handling): ",number_sent_apostophe/len(sentences)*100,"%")

    # 3. Lowercasing
    print("3. Lowercasing the sentences")
    data['text'] = [entry.lower() for entry in sentences]

    # 4. Removing Emojis, URLs, Hashtags, and Special Characters
    print("4. Removing Emojis, URLs, Hashtags, and Special Characters")
    data['text'] = data['text'].apply(preprocess_text)
    
    
    # 5. Handling Numerics
    print("5. Handling Numerics")
    p = inflect.engine()
    sentences = list(data.text)
    data.text = sentences
    final_ans = []
    for i, sent in tqdm(enumerate(sentences), total=len(sentences)):
        list_numbers = re.findall(r'[0-9]+', sent)
        list_numbers = [int(l) for l in list_numbers]
        list_words = [p.number_to_words(l) for l in list_numbers]
        dict_NW = dict(zip(list_numbers, list_words))
        myKeys = list(dict_NW.keys())
        myKeys.sort(reverse=True)
        sorted_dict = {i: dict_NW[i] for i in myKeys}
        ans = copy.copy(sent)
        for key in sorted_dict.keys():
            ans = re.sub(str(key), ' ' + sorted_dict[key] + ' ', ans)
        final_ans.append(ans)
    sentences = final_ans.copy()
    data.text = sentences
    data['text'] = data['text'].apply(removeDash)

    # 6. Removing Duplicate Tweets
    print("6. Removing Duplicate Tweets")
    data = data.drop_duplicates(subset=['text'])
    sentences=data.text
    # 7. Lemmatization
    print("7. Performing Lemmatization")
    ans = []
    for x in tqdm(sentences, total=len(sentences)):
        ans.append(lemmatize_text(x))
    sentences = ans.copy()
    data.text = sentences

    # 8. Stopwords Removal
    print("8. Removing Stopwords")
    stop_words = stopwords.words('english')
    sentences = [' '.join([word for word in x.split() if word not in stop_words]) for x in sentences]
    data.text = sentences
    
    print("Done")
    return data

In [11]:
data=preprocess_data(data)

1. Capturing Hashtags
2. Handling Apostrophes
Percentage of sentences with apostophe is (before handling):  16.710642040457344 %
Percentage of sentences with apostophe is (after handling):  8.988566402814424 %
3. Lowercasing the sentences
4. Removing Emojis, URLs, Hashtags, and Special Characters
5. Handling Numerics


100%|██████████████████████████████████| 11370/11370 [00:00<00:00, 85656.00it/s]


6. Removing Duplicate Tweets
7. Performing Lemmatization


100%|████████████████████████████████████| 10938/10938 [00:17<00:00, 624.05it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.text = sentences


8. Removing Stopwords
Done


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.text = sentences


# 2. Division B/w Test and train

In [12]:
X_train, X_test, y_train, y_test = train_test_split(data[['text','hashtags']], data.target, test_size=0.2, random_state=42)

# AMALGAMATION OF STATISTICAL AND CONTEXTUAL ANALYSIS

In [112]:
import numpy as np
import pandas as pd
import re
import emoji
import inflect
from tqdm import tqdm
import copy
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
import pickle
from sentence_transformers import SentenceTransformer,util
from sentence_transformers import SentencesDataset, LoggingHandler, losses
from sklearn.metrics import classification_report


In [121]:
# Converting the words having apostophe into their root form
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"there\'s", "there is", phrase)
    phrase = re.sub(r"it\'s", "it is", phrase)
    phrase = re.sub(r"he\'s", "he is", phrase)
    phrase = re.sub(r"she\'s", "she is", phrase)
    phrase = re.sub(r"how\'s", "how is", phrase)
    phrase = re.sub(r"let\'s", "let is", phrase)
    phrase = re.sub(r"so\'s", "so is", phrase)
    phrase = re.sub(r"what\'s", "what is", phrase)
    phrase = re.sub(r"when\'s", "when is", phrase)
    phrase = re.sub(r"where\'s", "where is", phrase)
    phrase = re.sub(r"why\'s", "why is", phrase)
  
    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [122]:
# De-emojize the text
def remove_emojis(text):
    return emoji.demojize(text)

# url not req while finding if text is disaster or not
def remove_urls(text):
    pattern = re.compile(r'http\S+|www\S+')
    return pattern.sub('', text)

def remove_hashtags(text):
    pattern = re.compile(r'#\w+')
    return pattern.sub('', text)

# spl chars not req while finding if text is disaster or not
def remove_special_characters(text):
    pattern = re.compile(r'[^a-zA-Z0-9\s]')
    return pattern.sub('', text)

In [123]:
def preprocess_text(text):
    text = remove_urls(text)
    # text = remove_hashtags(text)
    text = remove_special_characters(text)
    text = remove_emojis(text)
    return text

In [124]:
def removeDash(input_string):
        return re.sub(r'-', ' ', input_string)

In [125]:
def lemmatize_text(preprocessed_text):
    def get_wordnet_pos(word):
        """Map POS tag to first character lemmatize() accepts"""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        # print(tag)
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
        return tag_dict.get(tag, wordnet.NOUN)
    # 1. Init Lemmatizer
    lemmatizer = WordNetLemmatizer()
   # 2. Lemmatize a Sentence with the appropriate POS tag
    lemmatized_text=[lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(preprocessed_text)]
    # The final processed set of words for each iteration will be stored in 'text_final'
    text_final=(" ".join( lemmatized_text ))
    return text_final

In [126]:
smote = SMOTE(sampling_strategy='auto', random_state=42)

# 1. Data Cleaning for both

In [127]:
data = pd.read_csv('/Users/savinaysingh/Downloads/tweets 2.csv')


In [128]:
data = data[['text','target']]

In [129]:
def preprocess_data(data):
    """
    Preprocesses a DataFrame containing text data by applying various cleaning and text processing steps.
    
    Args:
    data (DataFrame): The input DataFrame containing text data.
    
    Returns:
    DataFrame: The preprocessed DataFrame with cleaned text data.
    """

    print("1. Capturing Hashtags")
    # 1. Capturing Hashtags
    pattern = r'#\w+'
    data['hashtags'] = data['text'].str.findall(pattern)
    data.hashtags=['' if [] == d else ' '.join([remove_special_characters(w) for w in d]) for d in data.hashtags]
    # 2. Handling Apostrophes
    print("2. Handling Apostrophes")
    sentences = data['text']
    number_sent_apostophe=len([x for x in sentences if "'" in x])
    print("Percentage of sentences with apostophe is (before handling): ",number_sent_apostophe/len(sentences)*100,"%")
    sentences = [decontracted(x) for x in sentences]
    data['text'] = sentences
    number_sent_apostophe=len([x for x in sentences if "'" in x])
    print("Percentage of sentences with apostophe is (after handling): ",number_sent_apostophe/len(sentences)*100,"%")

    # 3. Lowercasing
    print("3. Lowercasing the sentences")
    data['text'] = [entry.lower() for entry in sentences]

    # 4. Removing Emojis, URLs, Hashtags, and Special Characters
    print("4. Removing Emojis, URLs, Hashtags, and Special Characters")
    data['text'] = data['text'].apply(preprocess_text)
    
    
    # 5. Handling Numerics
    print("5. Handling Numerics")
    p = inflect.engine()
    sentences = list(data.text)
    data.text = sentences
    final_ans = []
    for i, sent in tqdm(enumerate(sentences), total=len(sentences)):
        list_numbers = re.findall(r'[0-9]+', sent)
        list_numbers = [int(l) for l in list_numbers]
        list_words = [p.number_to_words(l) for l in list_numbers]
        dict_NW = dict(zip(list_numbers, list_words))
        myKeys = list(dict_NW.keys())
        myKeys.sort(reverse=True)
        sorted_dict = {i: dict_NW[i] for i in myKeys}
        ans = copy.copy(sent)
        for key in sorted_dict.keys():
            ans = re.sub(str(key), ' ' + sorted_dict[key] + ' ', ans)
        final_ans.append(ans)
    sentences = final_ans.copy()
    data.text = sentences
    data['text'] = data['text'].apply(removeDash)

    # 6. Removing Duplicate Tweets
    print("6. Removing Duplicate Tweets")
    data = data.drop_duplicates(subset=['text'])
    sentences=data.text
    # 7. Lemmatization
    print("7. Performing Lemmatization")
    ans = []
    for x in tqdm(sentences, total=len(sentences)):
        ans.append(lemmatize_text(x))
    sentences = ans.copy()
    data.text = sentences

    # 8. Stopwords Removal
    print("8. Removing Stopwords")
    stop_words = stopwords.words('english')
    sentences = [' '.join([word for word in x.split() if word not in stop_words]) for x in sentences]
    data.text = sentences
    
    print("Done")
    return data

In [130]:
data=preprocess_data(data)

1. Capturing Hashtags
2. Handling Apostrophes
Percentage of sentences with apostophe is (before handling):  16.710642040457344 %
Percentage of sentences with apostophe is (after handling):  8.988566402814424 %
3. Lowercasing the sentences
4. Removing Emojis, URLs, Hashtags, and Special Characters
5. Handling Numerics


100%|██████████████████████████████████| 11370/11370 [00:00<00:00, 81561.47it/s]


6. Removing Duplicate Tweets
7. Performing Lemmatization


100%|████████████████████████████████████| 10957/10957 [00:17<00:00, 633.50it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.text = sentences


8. Removing Stopwords
Done


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.text = sentences


# 2. Division B/w Test and train

In [131]:
X_train, X_test, y_train, y_test = train_test_split(data[['text','hashtags']], data.target, test_size=0.2, random_state=42)

# 3.1. Vectorization for Statistical

In [132]:
vectorizer = TfidfVectorizer(ngram_range=(1,3))
vectorizer2 = TfidfVectorizer(ngram_range=(1,3))

In [147]:
matrix2 = vectorizer2.fit_transform(X_train['hashtags'])
countv_train2=pd.DataFrame(matrix2.toarray(),columns=vectorizer2.get_feature_names_out())

In [134]:
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [150]:
X_train_ST=model.encode(list(X_train.text))

In [151]:
X_test_ST=model.encode(list(X_test.text))

In [152]:
X_train_ST.shape

(8765, 768)

In [148]:
countv_train2.shape

(8765, 2582)

(8765, 2)

In [155]:
matrix2 = vectorizer2.transform(X_test['hashtags'])
countv_test2=pd.DataFrame(matrix2.toarray(),columns=vectorizer2.get_feature_names_out())


In [154]:
X_train_ST_x=np.concatenate((X_train_ST, countv_train2), axis=1)

In [156]:
X_test_ST_x=np.concatenate((X_test_ST, countv_test2), axis=1)

In [157]:
X_test_ST_x.shape

(2192, 3350)

In [158]:
X_train_ST_x.shape

(8765, 3350)

In [160]:
X_train_ST_x_sm, y_train_ST_x_sm = smote.fit_resample(X_train_ST_x, y_train)

In [162]:
X_train_ST_x_sm.shape

(14420, 3350)

In [163]:
rf_classifier=RandomForestClassifier(n_estimators=100, random_state=42,class_weight={0:1,1:1})

In [165]:
rf_classifier.fit(X_train_ST_x_sm, y_train_ST_x_sm)

In [166]:
y_pred=rf_classifier.predict(X_test_ST_x)

In [167]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.95      0.93      1794
           1       0.72      0.60      0.65       398

    accuracy                           0.88      2192
   macro avg       0.82      0.78      0.79      2192
weighted avg       0.88      0.88      0.88      2192



In [168]:
rf_classifier=RandomForestClassifier(n_estimators=100, random_state=42,class_weight={0:3,1:1})

In [169]:
rf_classifier.fit(X_train_ST_x_sm, y_train_ST_x_sm)

In [170]:
y_pred=rf_classifier.predict(X_test_ST_x)

In [171]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92      1794
           1       0.65      0.70      0.67       398

    accuracy                           0.88      2192
   macro avg       0.79      0.81      0.80      2192
weighted avg       0.88      0.88      0.88      2192



In [172]:
rf_classifier=RandomForestClassifier(n_estimators=100, random_state=42,class_weight={0:5,1:1})

In [173]:
rf_classifier.fit(X_train_ST_x_sm, y_train_ST_x_sm)

In [174]:
y_pred=rf_classifier.predict(X_test_ST_x)

In [175]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.91      0.92      1794
           1       0.63      0.73      0.68       398

    accuracy                           0.87      2192
   macro avg       0.79      0.82      0.80      2192
weighted avg       0.88      0.87      0.88      2192

