In [1]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\assma\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\assma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import pandas as pd

In [4]:
merged_df = pd.read_csv('../data/emotion_merged.csv')

In [5]:
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation and numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    
    # Tokenize the text into words
    words = word_tokenize(text)
    
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    
    # Rejoin words into a single string
    cleaned_text = ' '.join(words)
    
    return cleaned_text

In [7]:
def preprocess_text_safe(text):
    try:
        words = text.split()  # Split text into words
        cleaned_words = [word for word in words if word.lower() not in stop_words]  # Remove stopwords
        return ' '.join(cleaned_words)
    except Exception as e:
        print(f"Error processing text: {e}")
        return text  # Return original text in case of an error

# Apply the function to the 'Text' column
merged_df['Cleaned_Text'] = merged_df['Text'].apply(preprocess_text_safe)

In [13]:
merged_df[['Text', 'Cleaned_Text']].head(10)

Unnamed: 0,Text,Cleaned_Text
0,Why ?,?
1,Sage Act upgrade on my to do list for tommorow.,Sage Act upgrade list tommorow.
2,ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...,WAY HOMEGIRL BABY FUNERAL!!! MAN HATE FUNERALS...
3,Such an eye ! The true hazel eye-and so brill...,eye ! true hazel eye-and brilliant ! Regular f...
4,@Iluvmiasantos ugh babe.. hugggzzz for u .! b...,@Iluvmiasantos ugh babe.. hugggzzz u .! babe n...
5,I'm expecting an extremely important phonecall...,I'm expecting extremely important phonecall mi...
6,.Couldnt wait to see them live. If missing th...,.Couldnt wait see live. missing NH7 wasnt pain...
7,maken Tip 2: Stop op een moment dat je het hel...,maken Tip 2: Stop op een moment dat je het hel...
8,En dan krijg je ff een cadeautje van een tweep...,En dan krijg je ff een cadeautje van een tweep...
9,@1116am Drummer Boy bij op verzoek van @BiemO...,@1116am Drummer Boy bij op verzoek van @BiemOo...


In [15]:
merged_df.to_csv('emotion_with_cleaned_text.csv', index=False)

# Display the first few rows to verify the column is added and saved correctly
merged_df[['Text', 'Cleaned_Text']].head()

Unnamed: 0,Text,Cleaned_Text
0,Why ?,?
1,Sage Act upgrade on my to do list for tommorow.,Sage Act upgrade list tommorow.
2,ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...,WAY HOMEGIRL BABY FUNERAL!!! MAN HATE FUNERALS...
3,Such an eye ! The true hazel eye-and so brill...,eye ! true hazel eye-and brilliant ! Regular f...
4,@Iluvmiasantos ugh babe.. hugggzzz for u .! b...,@Iluvmiasantos ugh babe.. hugggzzz u .! babe n...


In [16]:
import nltk
from nltk.corpus import words
import re

# Download necessary resources
nltk.download('words')

# Get a set of valid English words
english_words = set(words.words())


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\assma\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


In [17]:

# Function to remove non-English words from the cleaned text
def remove_non_english_words(text):
    # Tokenize the text
    words_in_text = text.split()

    # Filter words that are in the English word set
    filtered_words = [word for word in words_in_text if word in english_words]

    # Rejoin the words into a single string
    cleaned_text = ' '.join(filtered_words)
    
    return cleaned_text

In [18]:
# Apply the function to the 'Cleaned_Text' column
merged_df['Cleaned_Text'] = merged_df['Cleaned_Text'].apply(remove_non_english_words)

In [19]:
# Save the updated DataFrame to a new CSV file
merged_df.to_csv('emotion_with_cleaned_text_no_non_english.csv', index=False)

In [20]:
# Display the first few rows to verify
merged_df[['Text', 'Cleaned_Text']].head()

Unnamed: 0,Text,Cleaned_Text
0,Why ?,
1,Sage Act upgrade on my to do list for tommorow.,upgrade list
2,ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...,
3,Such an eye ! The true hazel eye-and so brill...,eye true hazel brilliant open countenance comp...
4,@Iluvmiasantos ugh babe.. hugggzzz for u .! b...,ugh u babe ako e babe despite mas ko


In [21]:
merged_clean_df=pd.read_csv('../data/emotion_with_cleaned_text_no_non_english.csv')

In [22]:
import string

# Function to remove punctuation from text
def remove_punctuation(text):
    # Create a translation table to remove punctuation
    translator = str.maketrans('', '', string.punctuation)
    # Apply translation to remove punctuation
    return text.translate(translator)

# Apply the function to the 'Cleaned_Text' column
merged_df['Cleaned_Text'] = merged_df['Cleaned_Text'].apply(remove_punctuation)

# Save the updated DataFrame to a new CSV file
merged_df.to_csv('emotion_with_cleaned_text_no_punctuation.csv', index=False)

# Display the first few rows to verify
merged_df[['Text', 'Cleaned_Text']].head()

Unnamed: 0,Text,Cleaned_Text
0,Why ?,
1,Sage Act upgrade on my to do list for tommorow.,upgrade list
2,ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...,
3,Such an eye ! The true hazel eye-and so brill...,eye true hazel brilliant open countenance comp...
4,@Iluvmiasantos ugh babe.. hugggzzz for u .! b...,ugh u babe ako e babe despite mas ko


In [25]:
merged_df_1=pd.read_csv('../data/emotion_with_cleaned_text_no_punctuation.csv')

In [27]:
# Remove rows where 'Cleaned_Text' is empty or contains only whitespace
merged_df_2 = merged_df_1[merged_df_1['Cleaned_Text'].str.strip() != '']

# Save the updated DataFrame to a new CSV file
merged_df_2.to_csv('emotion_with_cleaned_text_no_blank.csv', index=False)

# Display the first few rows to verify
merged_df_2[['Text', 'Cleaned_Text']].head()

Unnamed: 0,Text,Cleaned_Text
0,Why ?,
1,Sage Act upgrade on my to do list for tommorow.,upgrade list
2,ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...,
3,Such an eye ! The true hazel eye-and so brill...,eye true hazel brilliant open countenance comp...
4,@Iluvmiasantos ugh babe.. hugggzzz for u .! b...,ugh u babe ako e babe despite mas ko


In [28]:
# Remove rows where 'Cleaned_Text' is empty, contains only whitespace, or is NaN
merged_df_2 = merged_df_1[merged_df_1['Cleaned_Text'].str.strip() != '']
merged_df_2 = merged_df_2.dropna(subset=['Cleaned_Text'])

# Save the updated DataFrame to a new CSV file
merged_df_2.to_csv('emotion_with_cleaned_text_no_blank_or_nan.csv', index=False)

# Display the first few rows to verify
merged_df_2[['Text', 'Cleaned_Text']].head()

Unnamed: 0,Text,Cleaned_Text
1,Sage Act upgrade on my to do list for tommorow.,upgrade list
3,Such an eye ! The true hazel eye-and so brill...,eye true hazel brilliant open countenance comp...
4,@Iluvmiasantos ugh babe.. hugggzzz for u .! b...,ugh u babe ako e babe despite mas ko
5,I'm expecting an extremely important phonecall...,extremely important minute
6,.Couldnt wait to see them live. If missing th...,wait see missing wasnt painful last gig


<h3>Tokenization</h3>

In [67]:
df10=pd.read_csv('../data/emotion_with_cleaned_text_no_blank_or_nan.csv')

In [68]:
df10['tokens']=df10['Cleaned_Text'].str.split()

In [69]:
print(df10[['Cleaned_Text','tokens']].head())

                                        Cleaned_Text  \
0                                       upgrade list   
1  eye true hazel brilliant open countenance comp...   
2               ugh u babe ako e babe despite mas ko   
3                         extremely important minute   
4            wait see missing wasnt painful last gig   

                                              tokens  
0                                    [upgrade, list]  
1  [eye, true, hazel, brilliant, open, countenanc...  
2     [ugh, u, babe, ako, e, babe, despite, mas, ko]  
3                     [extremely, important, minute]  
4    [wait, see, missing, wasnt, painful, last, gig]  


In [70]:
df10.to_csv('emotion_with_tokens.csv', index=False)

In [71]:
df11=pd.read_csv('../data/emotion_with_tokens.csv')

In [72]:
import re

In [73]:
# Function to check for digits or non-word characters
def check_non_word(text):
    if bool(re.search(r'\d', text)):  # Check for digits
        return True
    elif bool(re.search(r'[^\w\s]', text)):  # Check for non-word characters or punctuation
        return True
    else:
        return False

In [74]:
df11['contains_digits_or_non_word'] = df11['Cleaned_Text'].apply(check_non_word)

In [75]:
# Apply the function to the 'Cleaned_Text' column and filter rows that match the condition
rows_with_non_word = df10[df10['Cleaned_Text'].apply(check_non_word)]

In [76]:
print(rows_with_non_word)

Empty DataFrame
Columns: [Tweet_id, Emotion, Text, Cleaned_Text, tokens]
Index: []


<h3>Lemmatization</h3>

In [77]:
import nltk
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources if not already done
nltk.download('wordnet')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\assma\AppData\Roaming\nltk_data...


In [None]:
# Apply lemmatization to each token in the 'tokens' column
df11['lemmatized_tokens'] = df11['Tokens'].apply(lambda tokens: [lemmatizer.lemmatize(word) for word in tokens])

# Optionally, join the lemmatized tokens into a single string if needed
df11['lemmatized_text'] = df11['lemmatized_tokens'].apply(lambda x: ' '.join(x))


In [89]:
# Display the updated DataFrame to check the results
print(df11[['Cleaned_Text', 'Tokens', 'lemmatized_text']].head())

                                        Cleaned_Text  \
0                                       upgrade list   
1  eye true hazel brilliant open countenance comp...   
2               ugh u babe ako e babe despite mas ko   
3                         extremely important minute   
4            wait see missing wasnt painful last gig   

                                              Tokens  \
0                                ['upgrade', 'list']   
1  ['eye', 'true', 'hazel', 'brilliant', 'open', ...   
2  ['ugh', 'u', 'babe', 'ako', 'e', 'babe', 'desp...   
3               ['extremely', 'important', 'minute']   
4  ['wait', 'see', 'missing', 'wasnt', 'painful',...   

                                     lemmatized_text  
0              [ ' u p g r a d e ' ,   ' l i s t ' ]  
1  [ ' e y e ' ,   ' t r u e ' ,   ' h a z e l ' ...  
2  [ ' u g h ' ,   ' u ' ,   ' b a b e ' ,   ' a ...  
3  [ ' e x t r e m e l y ' ,   ' i m p o r t a n ...  
4  [ ' w a i t ' ,   ' s e e ' ,   ' m i s s i n ..

In [88]:
# Save the result to a new CSV file
df11.to_csv('emotion_with_lemmatized_tokens.csv', index=False)

In [90]:
df11['lemmatized_text1'] = df11['lemmatized_text'].apply(lambda x: ' '.join(x.split()))

In [92]:
print(df11[['Cleaned_Text', 'Tokens', 'lemmatized_text1']].head())

                                        Cleaned_Text  \
0                                       upgrade list   
1  eye true hazel brilliant open countenance comp...   
2               ugh u babe ako e babe despite mas ko   
3                         extremely important minute   
4            wait see missing wasnt painful last gig   

                                              Tokens  \
0                                ['upgrade', 'list']   
1  ['eye', 'true', 'hazel', 'brilliant', 'open', ...   
2  ['ugh', 'u', 'babe', 'ako', 'e', 'babe', 'desp...   
3               ['extremely', 'important', 'minute']   
4  ['wait', 'see', 'missing', 'wasnt', 'painful',...   

                                    lemmatized_text1  
0                [ ' u p g r a d e ' , ' l i s t ' ]  
1  [ ' e y e ' , ' t r u e ' , ' h a z e l ' , ' ...  
2  [ ' u g h ' , ' u ' , ' b a b e ' , ' a k o ' ...  
3  [ ' e x t r e m e l y ' , ' i m p o r t a n t ...  
4  [ ' w a i t ' , ' s e e ' , ' m i s s i n g ' ..

In [93]:
import re

def remove_extra_spaces(token):
    # Remove spaces between characters in the token
    cleaned_token = re.sub(r'\s+', '', token)
    return cleaned_token

In [94]:
df12=pd.read_csv('emotion_with_lemmatized_tokens.csv')

In [95]:
df12=df12.drop(['contains_digits_or_non_word', 'lemmatized_tokens', 'lemmatized_text'], axis=1)


In [96]:
df12.to_csv('emotion_without_specific_columns.csv', index=False)

In [97]:
df13=pd.read_csv('../data/emotion_with_tokens.csv')

In [99]:
import neattext.functions as nfx

# Remove the user handles
df13['Clean_Text'] = df13['Text'].apply(nfx.remove_userhandles)

In [100]:
dir(nfx)

['BTC_ADDRESS_REGEX',
 'CURRENCY_REGEX',
 'CURRENCY_SYMB_REGEX',
 'Counter',
 'DATE_REGEX',
 'EMAIL_REGEX',
 'EMOJI_REGEX',
 'HASTAG_REGEX',
 'MASTERCard_REGEX',
 'MD5_SHA_REGEX',
 'MOST_COMMON_PUNCT_REGEX',
 'NUMBERS_REGEX',
 'PHONE_REGEX',
 'PoBOX_REGEX',
 'SPECIAL_CHARACTERS_REGEX',
 'STOPWORDS',
 'STOPWORDS_de',
 'STOPWORDS_en',
 'STOPWORDS_es',
 'STOPWORDS_fr',
 'STOPWORDS_ru',
 'STOPWORDS_yo',
 'STREET_ADDRESS_REGEX',
 'TextFrame',
 'URL_PATTERN',
 'USER_HANDLES_REGEX',
 'VISACard_REGEX',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__generate_text',
 '__loader__',
 '__name__',
 '__numbers_dict',
 '__package__',
 '__spec__',
 '_lex_richness_herdan',
 '_lex_richness_maas_ttr',
 'clean_text',
 'defaultdict',
 'digit2words',
 'extract_btc_address',
 'extract_currencies',
 'extract_currency_symbols',
 'extract_dates',
 'extract_emails',
 'extract_emojis',
 'extract_hashtags',
 'extract_html_tags',
 'extract_mastercard_addr',
 'extract_md5sha',
 'extract_numbers',
 'extr

In [101]:
df13['Clean_Text'] = df13['Cleaned_Text'].apply(nfx.remove_stopwords)

In [102]:
df11.to_csv('emotion_with_tokens_1.csv', index=False)

In [103]:
print(df13[['Cleaned_Text', 'Tokens', 'Clean_Text']].head())

                                        Cleaned_Text  \
0                                       upgrade list   
1  eye true hazel brilliant open countenance comp...   
2               ugh u babe ako e babe despite mas ko   
3                         extremely important minute   
4            wait see missing wasnt painful last gig   

                                              Tokens  \
0                                ['upgrade', 'list']   
1  ['eye', 'true', 'hazel', 'brilliant', 'open', ...   
2  ['ugh', 'u', 'babe', 'ako', 'e', 'babe', 'desp...   
3               ['extremely', 'important', 'minute']   
4  ['wait', 'see', 'missing', 'wasnt', 'painful',...   

                                          Clean_Text  
0                                       upgrade list  
1  eye true hazel brilliant open countenance comp...  
2               ugh u babe ako e babe despite mas ko  
3                         extremely important minute  
4                     wait missing wasnt painful gi

In [112]:

x = df13['Tokens']
y = df13['Emotion']

<h1>Training the model<h1>

In [114]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [115]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [119]:
pipe_lr = Pipeline(steps=[('cv',CountVectorizer()),('lr',LogisticRegression(max_iter=5000))])
pipe_lr.fit(x_train,y_train)
pipe_lr.score(x_test,y_test)

0.3506334581053844

In [120]:
pipe_svm = Pipeline(steps=[('cv',CountVectorizer()),('svc', SVC(kernel = 'rbf', C = 10))])
pipe_svm.fit(x_train,y_train)
pipe_svm.score(x_test,y_test)

0.338756118629427

In [121]:
pipe_rf = Pipeline(steps=[('cv',CountVectorizer()),('rf', RandomForestClassifier(n_estimators=10))])
pipe_rf.fit(x_train,y_train)
pipe_rf.score(x_test,y_test)

0.3074431327382666

In [122]:
import joblib
pipeline_file = open("text_emotion.pkl","wb")
joblib.dump(pipe_lr,pipeline_file)
pipeline_file.close()