In [3]:
import numpy as np
import pandas as pd
train_data=pd.read_csv('/kaggle/input/sentiment-analysis-dataset/train.csv',encoding='latin1')
train_data

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26
...,...,...,...,...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,night,31-45,Ghana,31072940,227540.0,137
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,morning,46-60,Greece,10423054,128900.0,81
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,noon,60-70,Grenada,112523,340.0,331
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,night,70-100,Guatemala,17915568,107160.0,167


Removing the Unnecessary columns

In [4]:
columns_to_remove = ['textID', 'selected_text', 'Time of Tweet', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)']
train_data.drop(columns=columns_to_remove, inplace=True)


In [5]:
train_data

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative
...,...,...
27476,wish we could come see u on Denver husband l...,negative
27477,I`ve wondered about rake to. The client has ...,negative
27478,Yay good for both of you. Enjoy the break - y...,positive
27479,But it was worth it ****.,positive


**Check for missing value**

In [6]:
# Check for missing values
missing_values = train_data.isnull().sum()
print("Missing Values:\n", missing_values)

# Check for duplicates
duplicate_rows = train_data.duplicated().sum()
print("\nDuplicate Rows:", duplicate_rows)


Missing Values:
 text         1
sentiment    0
dtype: int64

Duplicate Rows: 0


**since we have a rows which contains text column missing so we will remove it**

In [7]:
train_data.dropna(subset=['text'], inplace=True)


*****Text cleaning***

In [8]:
import re

def clean_text(text):
    if isinstance(text, str):  # Check if text is a string
        # Remove special characters and HTML tags (except for links)
        cleaned_text = re.sub(r"<.*?>", "", text)  # Remove HTML tags
        cleaned_text = re.sub(r"[^a-zA-Z0-9\s]", "", cleaned_text)  # Remove special characters
        return cleaned_text.lower()  # Convert text to lowercase
    else:
        return text  # Return unchanged if not a string

# Apply text cleaning to 'text' column
train_data['text'] = train_data['text'].apply(clean_text)


**Tokenization: Split the text into individual words or tokens for further analysis.**

In [10]:
def tokenize_text(text):
    if isinstance(text, str):
        # Split the text into tokens using whitespace as the delimiter
        tokens = text.split()
        return tokens
    else:
        return []

# Applying the tokenization function to the 'text' column in the train_data DataFrame
train_data['tokens'] = train_data['text'].apply(tokenize_text)


**Stopwords Removal: Remove common stopwords while preserving the links.**

In [21]:
import requests

# Download the stopwords file
url = "https://gist.githubusercontent.com/ZohebAbai/513218c3468130eacff6481f424e4e64/raw/b70776f341a148293ff277afa0d0302c8c38f7e2/gist_stopwords.txt"
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Extract stopwords from the content
    stopwords = response.text.split(",")
else:
    print("Failed to download stopwords file.")

# Stopwords removal function
def remove_stopwords(text):
    if isinstance(text, str):
        # Split the text into tokens using whitespace as delimiter
        tokens = text.split()
        # Remove stopwords from the tokens
        filtered_tokens = [word for word in tokens if word.lower() not in stopwords]
        # Join the filtered tokens back into a string
        filtered_text = ' '.join(filtered_tokens)
        return filtered_text
    else:
        return text





# Applying the stopwords removal function to the 'text' column in the train_data DataFrame
train_data['text_without_stopwords'] = train_data['text'].apply(remove_stopwords)
print(train_data.head())


                                                text sentiment  \
0                  id have responded if i were going   neutral   
1         sooo sad i will miss you here in san diego  negative   
2                             my boss is bullying me  negative   
3                      what interview leave me alone  negative   
4   sons of  why couldnt they put them on the rel...  negative   

                                              tokens text_without_stopwords  \
0          [id, have, responded, if, i, were, going]              responded   
1  [sooo, sad, i, will, miss, you, here, in, san,...     sooo sad san diego   
2                       [my, boss, is, bullying, me]          boss bullying   
3                [what, interview, leave, me, alone]        interview leave   
4  [sons, of, why, couldnt, they, put, them, on, ...   sons releases bought   

                                       text_features  word_count  char_count  \
0  {'word_count': 7, 'char_count': 34, 'has_link

**Stemming or Lemmatization: Reduce words to their base or root form.**

In [22]:
def stem_word(word):
    # Define stemming rules
    suffixes = {
        's': '',       # Remove plural 's'
        'es': '',      # Remove plural 'es'
        'ies': 'y',    # Replace 'ies' with 'y'
        'ed': '',      # Remove past tense 'ed'
        'ing': '',     # Remove present participle 'ing'
        'ly': '',      # Remove adverb suffix 'ly'
        'er': '',      # Remove comparative suffix 'er'
        'est': '',     # Remove superlative suffix 'est'
        'y': 'i',      # Replace 'y' with 'i' if preceded by a consonant
        'ation': 'ate',  # Replace 'ation' with 'ate'
        'tion': 'te',    # Replace 'tion' with 'te'
        'er': '',        # Remove comparative suffix 'er'
        'est': '',       # Remove superlative suffix 'est'
        'ment': '',      # Remove suffix 'ment'
        'ness': '',      # Remove suffix 'ness'
        'ive': '',       # Remove suffix 'ive'
        'ful': '',       # Remove suffix 'ful'
        'ness': '',      # Remove suffix 'ness'
        'less': '',      # Remove suffix 'less'
        # Add more stemming rules as needed
    }
    
    for suffix in suffixes:
        if word.endswith(suffix):
            # Apply the stemming rule by removing the suffix or replacing characters
            if suffix == 'y' and len(word) > 1 and word[-2] not in 'aeiou':
                return word[:-len(suffix)] + suffixes[suffix]
            else:
                return word[:-len(suffix)] + suffixes[suffix]
    
    return word  # Return the word unchanged if no stemming rule applies

# Example usage
word = "running"
stemmed_word = stem_word(word)
print("Original Word:", word)
print("Stemmed Word:", stemmed_word)


Failed to download stemming rules file.


NameError: name 'suffixes' is not defined

In [17]:
import re

# Feature Engineering
def extract_features(text):
    features = {}

    if isinstance(text, str):  # Check if text is a string
        # Word count
        words = text.split()  # Split text into words
        features['word_count'] = len(words)

        # Character count
        features['char_count'] = len(text)

        # Presence of links
        features['has_links'] = 1 if re.search(r"http\S+|www\S+", text) else 0

    else:
        # Set features to None for non-string inputs
        features['word_count'] = None
        features['char_count'] = None
        features['has_links'] = None

    return features

# Apply feature extraction to 'text' column
train_data['text_features'] = train_data['text'].apply(extract_features)

# Convert the extracted features into DataFrame columns
df_features = pd.DataFrame(train_data['text_features'].tolist())

# Concatenate the extracted features DataFrame with the original DataFrame
train_data = pd.concat([train_data, df_features], axis=1)


train_data

Unnamed: 0,text,sentiment,tokens,text_without_stopwords,text_features,word_count,char_count,has_links,word_count.1,char_count.1,has_links.1
0,id have responded if i were going,neutral,"[id, have, responded, if, i, were, going]",responded,"{'word_count': 7, 'char_count': 34, 'has_links...",7.0,34.0,0.0,7.0,34.0,0.0
1,sooo sad i will miss you here in san diego,negative,"[sooo, sad, i, will, miss, you, here, in, san,...",sooo sad san diego,"{'word_count': 10, 'char_count': 43, 'has_link...",10.0,43.0,0.0,10.0,43.0,0.0
2,my boss is bullying me,negative,"[my, boss, is, bullying, me]",boss bullying,"{'word_count': 5, 'char_count': 22, 'has_links...",5.0,22.0,0.0,5.0,22.0,0.0
3,what interview leave me alone,negative,"[what, interview, leave, me, alone]",interview leave,"{'word_count': 5, 'char_count': 30, 'has_links...",5.0,30.0,0.0,5.0,30.0,0.0
4,sons of why couldnt they put them on the rel...,negative,"[sons, of, why, couldnt, they, put, them, on, ...",sons releases bought,"{'word_count': 13, 'char_count': 69, 'has_link...",13.0,69.0,0.0,13.0,69.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
27477,ive wondered about rake to the client has ma...,negative,"[ive, wondered, about, rake, to, the, client, ...",ive wondered rake client clear net dont force ...,"{'word_count': 23, 'char_count': 115, 'has_lin...",21.0,109.0,0.0,21.0,109.0,0.0
27478,yay good for both of you enjoy the break you...,positive,"[yay, good, for, both, of, you, enjoy, the, br...",yay good enjoy break hectic weekend care hun xxxx,"{'word_count': 21, 'char_count': 109, 'has_lin...",5.0,22.0,0.0,5.0,22.0,0.0
27479,but it was worth it,positive,"[but, it, was, worth, it]",worth,"{'word_count': 5, 'char_count': 22, 'has_links...",10.0,55.0,0.0,10.0,55.0,0.0
27480,all this flirting going on the atg smiles ...,neutral,"[all, this, flirting, going, on, the, atg, smi...",flirting atg smiles yay hugs,"{'word_count': 10, 'char_count': 55, 'has_link...",,,,,,


**We have done this because we don't know why 27480 row number contain NaN althought it contains words**

In [None]:
train_data["word_count"][27480]= 10
train_data["char_count"][27480]= 55
train_data["has_links"][27480]= 0.0

In [14]:
train_data

Unnamed: 0,text,sentiment,tokens,text_without_stopwords,text_features,word_count,char_count,has_links
0,id have responded if i were going,neutral,"[id, have, responded, if, i, were, going]",id responded going,"{'word_count': 7, 'char_count': 34, 'has_links...",7.0,34.0,0.0
1,sooo sad i will miss you here in san diego,negative,"[sooo, sad, i, will, miss, you, here, in, san,...",sooo sad miss san diego,"{'word_count': 10, 'char_count': 43, 'has_link...",10.0,43.0,0.0
2,my boss is bullying me,negative,"[my, boss, is, bullying, me]",boss bullying,"{'word_count': 5, 'char_count': 22, 'has_links...",5.0,22.0,0.0
3,what interview leave me alone,negative,"[what, interview, leave, me, alone]",interview leave alone,"{'word_count': 5, 'char_count': 30, 'has_links...",5.0,30.0,0.0
4,sons of why couldnt they put them on the rel...,negative,"[sons, of, why, couldnt, they, put, them, on, ...",sons couldnt put releases already bought,"{'word_count': 13, 'char_count': 69, 'has_link...",13.0,69.0,0.0
...,...,...,...,...,...,...,...,...
27477,ive wondered about rake to the client has ma...,negative,"[ive, wondered, about, rake, to, the, client, ...",ive wondered rake client made clear net dont f...,"{'word_count': 23, 'char_count': 115, 'has_lin...",21.0,109.0,0.0
27478,yay good for both of you enjoy the break you...,positive,"[yay, good, for, both, of, you, enjoy, the, br...",yay good enjoy break probably need hectic week...,"{'word_count': 21, 'char_count': 109, 'has_lin...",5.0,22.0,0.0
27479,but it was worth it,positive,"[but, it, was, worth, it]",worth,"{'word_count': 5, 'char_count': 22, 'has_links...",10.0,55.0,0.0
27480,all this flirting going on the atg smiles ...,neutral,"[all, this, flirting, going, on, the, atg, smi...",flirting going atg smiles yay hugs,"{'word_count': 10, 'char_count': 55, 'has_link...",,,
