In [36]:
import pandas as pd

df1 = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/train.csv',encoding='latin1')
df2 = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/test.csv',encoding='latin1')

# Merge the DataFrames
train_data = pd.concat([df1, df2], ignore_index=True)

# Write the merged DataFrame to a new CSV file
train_data.to_csv('merged_file.csv', index=False)


Removing the Unnecessary columns

In [37]:
columns_to_remove = ['textID', 'selected_text', 'Time of Tweet', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)']
train_data.drop(columns=columns_to_remove, inplace=True)


In [38]:
train_data

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative
...,...,...
32291,,
32292,,
32293,,
32294,,


In [39]:
# Check for missing values
missing_values = train_data.isnull().sum()
print("Missing Values:\n", missing_values)

# Check for duplicates
duplicate_rows = train_data.duplicated().sum()
print("\nDuplicate Rows:", duplicate_rows)

Missing Values:
 text         1282
sentiment    1281
dtype: int64

Duplicate Rows: 1280


In [40]:
train_data.dropna(subset=['text'], inplace=True)
# train_data.dropna(subset=['text_lower'],inplace=True)


In [41]:
# Lowercase Conversion
train_data['text_lower'] = train_data['text'].str.lower()


In [42]:
train_data

Unnamed: 0,text,sentiment,text_lower
0,"I`d have responded, if I were going",neutral,"i`d have responded, if i were going"
1,Sooo SAD I will miss you here in San Diego!!!,negative,sooo sad i will miss you here in san diego!!!
2,my boss is bullying me...,negative,my boss is bullying me...
3,what interview! leave me alone,negative,what interview! leave me alone
4,"Sons of ****, why couldn`t they put them on t...",negative,"sons of ****, why couldn`t they put them on t..."
...,...,...,...
31010,"its at 3 am, im very tired but i can`t sleep ...",negative,"its at 3 am, im very tired but i can`t sleep ..."
31011,All alone in this old house again. Thanks for...,positive,all alone in this old house again. thanks for...
31012,I know what you mean. My little dog is sinkin...,negative,i know what you mean. my little dog is sinkin...
31013,_sutra what is your next youtube video gonna b...,positive,_sutra what is your next youtube video gonna b...


*****Text cleaning***

In [43]:
import re

def clean_text(text):
    if isinstance(text, str):  # Check if text is a string
        # Remove special characters, HTML tags, and links
        cleaned_text = re.sub(r"<.*?>", "", text)  # Remove HTML tags
        cleaned_text = re.sub(r"http\S+|www\.\S+", "", cleaned_text)  # Remove links
        cleaned_text = re.sub(r"[^a-zA-Z0-9\s]", "", cleaned_text)  # Remove special characters
        return cleaned_text.lower()  # Convert text to lowercase
    else:
        return text  # Return unchanged if not a string

# Apply text cleaning to 'text' column
train_data['text'] = train_data['text'].apply(clean_text)


**Tokenization: Split the text into individual words or tokens for further analysis.**

In [44]:
def tokenize_text(text):
    if isinstance(text, str):
        # Split the text into tokens using whitespace as the delimiter
        tokens = text.split()
        return tokens
    else:
        return []

# Applying the tokenization function to the 'text' column in the train_data DataFrame
train_data['tokens'] = train_data['text'].apply(tokenize_text)


**Stopwords Removal: Remove common stopwords while preserving the links.**

In [45]:
import requests

# Download the stopwords file
url = "https://gist.githubusercontent.com/ZohebAbai/513218c3468130eacff6481f424e4e64/raw/b70776f341a148293ff277afa0d0302c8c38f7e2/gist_stopwords.txt"
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Extract stopwords from the content
    stopwords = response.text.split(",")
else:
    print("Failed to download stopwords file.")

# Stopwords removal function
def remove_stopwords(text):
    if isinstance(text, str):
        # Split the text into tokens using whitespace as delimiter
        tokens = text.split()
        # Remove stopwords from the tokens
        filtered_tokens = [word for word in tokens if word.lower() not in stopwords]
        # Join the filtered tokens back into a string
        filtered_text = ' '.join(filtered_tokens)
        return filtered_text
    else:
        return text





# Applying the stopwords removal function to the 'text' column in the train_data DataFrame
train_data['text_without_stopwords'] = train_data['text'].apply(remove_stopwords)


In [46]:
train_data

Unnamed: 0,text,sentiment,text_lower,tokens,text_without_stopwords
0,id have responded if i were going,neutral,"i`d have responded, if i were going","[id, have, responded, if, i, were, going]",responded
1,sooo sad i will miss you here in san diego,negative,sooo sad i will miss you here in san diego!!!,"[sooo, sad, i, will, miss, you, here, in, san,...",sooo sad san diego
2,my boss is bullying me,negative,my boss is bullying me...,"[my, boss, is, bullying, me]",boss bullying
3,what interview leave me alone,negative,what interview! leave me alone,"[what, interview, leave, me, alone]",interview leave
4,sons of why couldnt they put them on the rel...,negative,"sons of ****, why couldn`t they put them on t...","[sons, of, why, couldnt, they, put, them, on, ...",sons releases bought
...,...,...,...,...,...
31010,its at 3 am im very tired but i cant sleep bu...,negative,"its at 3 am, im very tired but i can`t sleep ...","[its, at, 3, am, im, very, tired, but, i, cant...",3 tired sleep
31011,all alone in this old house again thanks for ...,positive,all alone in this old house again. thanks for...,"[all, alone, in, this, old, house, again, than...",house net alive kicking invented net wanna kis...
31012,i know what you mean my little dog is sinking...,negative,i know what you mean. my little dog is sinkin...,"[i, know, what, you, mean, my, little, dog, is...",dog sinking depression someplace tropical
31013,sutra what is your next youtube video gonna be...,positive,_sutra what is your next youtube video gonna b...,"[sutra, what, is, your, next, youtube, video, ...",sutra youtube video gonna love videos


In [52]:
from sklearn.preprocessing import LabelEncoder

# Encode the sentiment labels
label_encoder = LabelEncoder()
train_data['sentiment_encoded'] = label_encoder.fit_transform(train_data['sentiment'])


In [54]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Assuming you have already encoded sentiment labels as 'sentiment_encoded'

# Step 1: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(train_data['text_without_stopwords'],train_data['sentiment_encoded'], test_size=0.2, random_state=42)

# Step 2: Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting to top 5000 features
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Step 3: Train the SVM classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_tfidf, y_train)

# Step 4: Predict sentiment on the test set
y_pred = svm_classifier.predict(X_test_tfidf)

# Step 5: Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.6595195872964694
              precision    recall  f1-score   support

           0       0.72      0.54      0.62      1749
           1       0.58      0.77      0.67      2502
           2       0.76      0.62      0.69      1952

    accuracy                           0.66      6203
   macro avg       0.69      0.64      0.66      6203
weighted avg       0.68      0.66      0.66      6203



**For improving accuracy**

In [57]:
# Re-split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(train_data['text_without_stopwords'], train_data['sentiment_encoded'], test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Tune hyperparameters of the SVM classifier
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['linear', 'rbf', 'sigmoid']}

# Instantiate the GridSearchCV object
grid_search = GridSearchCV(SVC(), param_grid, cv=3, n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train_tfidf, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Use the best parameters to train the SVM classifier
best_svm_classifier = SVC(**best_params)
best_svm_classifier.fit(X_train_tfidf, y_train)

# Predict sentiment on the test set
y_pred = best_svm_classifier.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Best Parameters: {'C': 1, 'gamma': 1, 'kernel': 'linear'}
Accuracy: 0.6595195872964694
              precision    recall  f1-score   support

           0       0.72      0.54      0.62      1749
           1       0.58      0.77      0.67      2502
           2       0.76      0.62      0.69      1952

    accuracy                           0.66      6203
   macro avg       0.69      0.64      0.66      6203
weighted avg       0.68      0.66      0.66      6203

