In [2]:
import pandas as pd

df1 = pd.read_csv('train.csv',encoding='latin1')
df2 = pd.read_csv('test.csv',encoding='latin1')

# Merge the DataFrames
train_data = pd.concat([df1, df2], ignore_index=True)

# Write the merged DataFrame to a new CSV file
train_data.to_csv('merged_file.csv', index=False)
print(train_data.head())


       textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138                          my boss is bullying me...   
3  9642c003ef                     what interview! leave me alone   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...   

                         selected_text sentiment Time of Tweet Age of User  \
0  I`d have responded, if I were going   neutral       morning        0-20   
1                             Sooo SAD  negative          noon       21-30   
2                          bullying me  negative         night       31-45   
3                       leave me alone  negative       morning       46-60   
4                        Sons of ****,  negative          noon       60-70   

       Country  Population -2020  Land Area (Km²)  Density (P/Km²)  
0  Afghanistan        38928346.0         652860.0    

Removing the Unnecessary columns

In [3]:
columns_to_remove = ['textID', 'selected_text', 'Time of Tweet', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)']
train_data.drop(columns=columns_to_remove, inplace=True)


In [4]:
train_data

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative
...,...,...
32291,,
32292,,
32293,,
32294,,


In [5]:
# Check for missing values
missing_values = train_data.isnull().sum()
print("Missing Values:\n", missing_values)

# Check for duplicates
duplicate_rows = train_data.duplicated().sum()
print("\nDuplicate Rows:", duplicate_rows)

Missing Values:
 text         1282
sentiment    1281
dtype: int64

Duplicate Rows: 1280


In [6]:
train_data.dropna(subset=['text'], inplace=True)
# train_data.dropna(subset=['text_lower'],inplace=True)


In [7]:
# Lowercase Conversion
train_data['text_lower'] = train_data['text'].str.lower()


In [8]:
train_data

Unnamed: 0,text,sentiment,text_lower
0,"I`d have responded, if I were going",neutral,"i`d have responded, if i were going"
1,Sooo SAD I will miss you here in San Diego!!!,negative,sooo sad i will miss you here in san diego!!!
2,my boss is bullying me...,negative,my boss is bullying me...
3,what interview! leave me alone,negative,what interview! leave me alone
4,"Sons of ****, why couldn`t they put them on t...",negative,"sons of ****, why couldn`t they put them on t..."
...,...,...,...
31010,"its at 3 am, im very tired but i can`t sleep ...",negative,"its at 3 am, im very tired but i can`t sleep ..."
31011,All alone in this old house again. Thanks for...,positive,all alone in this old house again. thanks for...
31012,I know what you mean. My little dog is sinkin...,negative,i know what you mean. my little dog is sinkin...
31013,_sutra what is your next youtube video gonna b...,positive,_sutra what is your next youtube video gonna b...


*****Text cleaning***

In [9]:
import re

def clean_text(text):
    if isinstance(text, str):  # Check if text is a string
        # Remove special characters, HTML tags, and links
        cleaned_text = re.sub(r"<.*?>", "", text)  # Remove HTML tags
        cleaned_text = re.sub(r"http\S+|www\.\S+", "", cleaned_text)  # Remove links
        cleaned_text = re.sub(r"[^a-zA-Z0-9\s]", "", cleaned_text)  # Remove special characters
        return cleaned_text.lower()  # Convert text to lowercase
    else:
        return text  # Return unchanged if not a string

# Apply text cleaning to 'text' column
train_data['text'] = train_data['text'].apply(clean_text)


**Tokenization: Split the text into individual words or tokens for further analysis.**

In [10]:
def tokenize_text(text):
    if isinstance(text, str):
        # Split the text into tokens using whitespace as the delimiter
        tokens = text.split()
        return tokens
    else:
        return []

# Applying the tokenization function to the 'text' column in the train_data DataFrame
train_data['tokens'] = train_data['text'].apply(tokenize_text)


**Stopwords Removal: Remove common stopwords while preserving the links.**

In [11]:
import requests

# Download the stopwords file
url = "https://gist.githubusercontent.com/ZohebAbai/513218c3468130eacff6481f424e4e64/raw/b70776f341a148293ff277afa0d0302c8c38f7e2/gist_stopwords.txt"
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Extract stopwords from the content
    stopwords = response.text.split(",")
else:
    print("Failed to download stopwords file.")

# Stopwords removal function
def remove_stopwords(text):
    if isinstance(text, str):
        # Split the text into tokens using whitespace as delimiter
        tokens = text.split()
        # Remove stopwords from the tokens
        filtered_tokens = [word for word in tokens if word.lower() not in stopwords]
        # Join the filtered tokens back into a string
        filtered_text = ' '.join(filtered_tokens)
        return filtered_text
    else:
        return text





# Applying the stopwords removal function to the 'text' column in the train_data DataFrame
train_data['text_without_stopwords'] = train_data['text'].apply(remove_stopwords)


In [12]:
train_data

Unnamed: 0,text,sentiment,text_lower,tokens,text_without_stopwords
0,id have responded if i were going,neutral,"i`d have responded, if i were going","[id, have, responded, if, i, were, going]",responded
1,sooo sad i will miss you here in san diego,negative,sooo sad i will miss you here in san diego!!!,"[sooo, sad, i, will, miss, you, here, in, san,...",sooo sad san diego
2,my boss is bullying me,negative,my boss is bullying me...,"[my, boss, is, bullying, me]",boss bullying
3,what interview leave me alone,negative,what interview! leave me alone,"[what, interview, leave, me, alone]",interview leave
4,sons of why couldnt they put them on the rel...,negative,"sons of ****, why couldn`t they put them on t...","[sons, of, why, couldnt, they, put, them, on, ...",sons releases bought
...,...,...,...,...,...
31010,its at 3 am im very tired but i cant sleep bu...,negative,"its at 3 am, im very tired but i can`t sleep ...","[its, at, 3, am, im, very, tired, but, i, cant...",3 tired sleep
31011,all alone in this old house again thanks for ...,positive,all alone in this old house again. thanks for...,"[all, alone, in, this, old, house, again, than...",house net alive kicking invented net wanna kis...
31012,i know what you mean my little dog is sinking...,negative,i know what you mean. my little dog is sinkin...,"[i, know, what, you, mean, my, little, dog, is...",dog sinking depression someplace tropical
31013,sutra what is your next youtube video gonna be...,positive,_sutra what is your next youtube video gonna b...,"[sutra, what, is, your, next, youtube, video, ...",sutra youtube video gonna love videos


In [13]:
from sklearn.preprocessing import LabelEncoder

# Encode the sentiment labels
label_encoder = LabelEncoder()
train_data['sentiment_encoded'] = label_encoder.fit_transform(train_data['sentiment'])


In [14]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score


# Step 1: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(train_data['text_without_stopwords'],train_data['sentiment_encoded'], test_size=0.2, random_state=42)

# Step 2: Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting to top 5000 features
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# # Step 3: Train the SVM classifier
# svm_classifier = SVC(kernel='linear')
# svm_classifier.fit(X_train_tfidf, y_train)

# # Step 4: Predict sentiment on the test set
# y_pred = svm_classifier.predict(X_test_tfidf)

# # Step 5: Evaluate the model
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))


In [25]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Step 4: Define the architecture of the ANN
model = Sequential()
model.add(Dense(128, input_shape=(X_train_tfidf.shape[1],), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Step 5: Compile the ANN
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 6: Train the ANN on the training data
history = model.fit(X_train_tfidf, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Step 7: Evaluate the performance of the ANN on the testing data
loss, accuracy = model.evaluate(X_test_tfidf, y_test)
print("Accuracy:", accuracy)


Epoch 1/10


ValueError: in user code:

    File "c:\Users\91798\anaconda3\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\91798\anaconda3\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\91798\anaconda3\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\91798\anaconda3\lib\site-packages\keras\engine\training.py", line 994, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "c:\Users\91798\anaconda3\lib\site-packages\keras\engine\training.py", line 1052, in compute_loss
        return self.compiled_loss(
    File "c:\Users\91798\anaconda3\lib\site-packages\keras\engine\compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "c:\Users\91798\anaconda3\lib\site-packages\keras\losses.py", line 152, in __call__
        losses = call_fn(y_true, y_pred)
    File "c:\Users\91798\anaconda3\lib\site-packages\keras\losses.py", line 272, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "c:\Users\91798\anaconda3\lib\site-packages\keras\losses.py", line 2162, in binary_crossentropy
        backend.binary_crossentropy(y_true, y_pred, from_logits=from_logits),
    File "c:\Users\91798\anaconda3\lib\site-packages\keras\backend.py", line 5677, in binary_crossentropy
        return tf.nn.sigmoid_cross_entropy_with_logits(

    ValueError: `logits` and `labels` must have the same shape, received ((None, 10) vs (None, 1)).


**For improving accuracy**

In [None]:
# Re-split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(train_data['text_without_stopwords'], train_data['sentiment_encoded'], test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Tune hyperparameters of the SVM classifier
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['linear', 'rbf', 'sigmoid']}

# Instantiate the GridSearchCV object
grid_search = GridSearchCV(SVC(), param_grid, cv=3, n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train_tfidf, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Use the best parameters to train the SVM classifier
best_svm_classifier = SVC(**best_params)
best_svm_classifier.fit(X_train_tfidf, y_train)

# Predict sentiment on the test set
y_pred = best_svm_classifier.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Best Parameters: {'C': 1, 'gamma': 1, 'kernel': 'linear'}
Accuracy: 0.6595195872964694
              precision    recall  f1-score   support

           0       0.72      0.54      0.62      1749
           1       0.58      0.77      0.67      2502
           2       0.76      0.62      0.69      1952

    accuracy                           0.66      6203
   macro avg       0.69      0.64      0.66      6203
weighted avg       0.68      0.66      0.66      6203

