# Machine Learning Model for Sentiment Analysis of Social Media Posts

# Step 1: Importing Libraries

In [66]:
# Import important libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report



# Step 2: Loading data

In [47]:
# Read the tweet dataset into a DataFrame.
# NOTE(review): this is a hardcoded absolute path; it must be adjusted when
# the notebook runs in a different environment.
dataset_path = "/content/drive/MyDrive/Inter/Codingraja/ML Intern/Tasks/Task-1/sentiment_tweets3.csv"
df = pd.read_csv(dataset_path)

# Step 3: Handling Missing Values

In [48]:

# Count the nulls per column and report them
missing_per_column = df.isnull().sum()
print("Missing values in each column:\n", missing_per_column)

# Keep only the rows that have a value in every column
df = df.dropna()

Missing values in each column:
 Index                        0
message to examine           0
label (depression result)    0
dtype: int64


# Step 4: Data Preprocessing

In [49]:
# Translate the numeric labels into sentiment strings.
# Series.map yields NaN for any value that is not a key of the mapping, so
# re-running this cell on the already-mapped column turns every label into NaN.
label_names = {0: 'negative', 1: 'positive'}
numeric_labels = df['label (depression result)']
df['label (depression result)'] = numeric_labels.map(label_names)


In [50]:
# Count how many rows carry each label and show the result
label_counts = df['label (depression result)'].value_counts()
print(label_counts)

label (depression result)
negative    8000
positive    2314
Name: count, dtype: int64


In [51]:
# Download necessary NLTK data
# 'stopwords' provides the word list read via stopwords.words('english')
# inside preprocess_text.
nltk.download('stopwords')
# 'punkt' is presumably the tokenizer resource that word_tokenize loads;
# newer NLTK releases may require 'punkt_tab' instead — TODO confirm against
# the installed version.
# This call is the last expression of the cell, so its return value is what
# the notebook displays as the cell output.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [52]:
# Function for text preprocessing
def preprocess_text(text):
    """Clean one message so it can be fed to the TF-IDF vectorizer.

    Steps, in order: delete every character that is neither an ASCII letter
    nor whitespace, lowercase the text, strip leading/trailing whitespace,
    tokenize with NLTK's ``word_tokenize`` and drop English stopwords.

    Args:
        text: Raw message string.

    Returns:
        The surviving tokens joined by single spaces (may be an empty string).
    """
    # Remove special characters and numbers.
    # re.sub's fourth positional parameter is `count`, not `flags`: the former
    # `re.I|re.A` in that slot never acted as flags and only capped the number
    # of removals at its integer value (258). The character class already
    # covers both letter cases, so no flags are needed.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower().strip()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords (a set gives constant-time membership tests)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return " ".join(filtered_tokens)

In [53]:
# Clean every raw message and store the result in a new column
raw_messages = df['message to examine']
df['cleaned_text'] = raw_messages.apply(preprocess_text)

# Step 5: Feature Extraction

In [54]:
# Build the TF-IDF vectorizer with a capped number of features
max_tfidf_features = 5000
tfidf_vectorizer = TfidfVectorizer(max_features=max_tfidf_features)

In [55]:
# Fit the vectorizer on the cleaned messages and turn them into a TF-IDF matrix.
# NOTE(review): the vectorizer is fitted on all rows, including those that the
# later train/test split assigns to the test set.
cleaned_corpus = df['cleaned_text']
X = tfidf_vectorizer.fit_transform(cleaned_corpus)

In [56]:
# Encode the sentiment strings as integers for the classifier.
# Any value that is not a key of the mapping becomes NaN.
sentiment_codes = {'positive': 1, 'negative': 0}
y = df['label (depression result)'].map(sentiment_codes)

In [57]:
# Ensure no NaN values remain.
# X was already built with one row per entry of y, so rows must be removed
# from both with the same mask; dropping them from y alone would leave X and y
# with different lengths and mismatched rows.
labelled_rows = y.notna().to_numpy()
X = X[labelled_rows]
y = y[labelled_rows]

In [58]:
# Cast the label vector to an integer dtype
y = y.astype(dtype=int)

In [59]:
# Show both shapes so a row-count mismatch between X and y is visible
print(
    f"Shape of X: {X.shape}",
    f"Shape of y: {y.shape}",
    sep="\n",
)

Shape of X: (10314, 5000)
Shape of y: (10314,)


# Step 6: Model Selection

In [60]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified although the label counts printed
# earlier are imbalanced (8000 vs 2314).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [61]:
# Initialize the SVM classifier
# probability=True makes fit() run an internal cross-validation to calibrate
# probability estimates, and that procedure shuffles the data randomly. Pin the
# seed (same value as the train/test split) so repeated runs yield the same
# calibrated model.
svm_classifier = SVC(kernel='linear', probability=True, random_state=42)

# Step 7: Model Training

In [62]:
# Fit the classifier on the training portion
svm_classifier.fit(X=X_train, y=y_train)

In [63]:
# Generate label predictions for the held-out rows
y_pred = svm_classifier.predict(X=X_test)

# Step 8: Model Evaluation and Analysis

---



In [64]:
# Score the predictions: plain accuracy plus support-weighted precision, recall and F1
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

In [65]:
# Print the four summary metrics, one per line, followed by the per-class report
print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    sep="\n",
)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9917595734367426
Precision: 0.9918454638423682
Recall: 0.9917595734367426
F1 Score: 0.9917016538463619

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      1614
           1       1.00      0.96      0.98       449

    accuracy                           0.99      2063
   macro avg       0.99      0.98      0.99      2063
weighted avg       0.99      0.99      0.99      2063

