<a href="https://colab.research.google.com/github/EmmanuelKnows/DS-Codveda/blob/main/Natural_Language_Processing_(NLP)_(Text_Classification).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing (NLP) - Text Classification

## Import Required Libraries



In [6]:
# Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
warnings.filterwarnings('ignore')

# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# ML libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, GridSearchCV

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## Load and Explore the Dataset

In [56]:
# Load the dataset
df = pd.read_csv('Sentiment dataset.csv')

# Display basic info
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset Info:")
print(df.info())
print("\nMissing Values:")
print(df.isnull().sum())


Dataset Shape: (732, 15)

First few rows:
   Unnamed: 0.1  Unnamed: 0  \
0             0           0   
1             1           1   
2             2           2   
3             3           3   
4             4           4   

                                                Text    Sentiment  \
0   Enjoying a beautiful day at the park!        ...   Positive     
1   Traffic was terrible this morning.           ...   Negative     
2   Just finished an amazing workout! ðŸ’ª          ...   Positive     
3   Excited about the upcoming weekend getaway!  ...   Positive     
4   Trying out a new recipe for dinner tonight.  ...   Neutral      

             Timestamp            User     Platform  \
0  2023-01-15 12:30:00   User123          Twitter     
1  2023-01-15 08:45:00   CommuterX        Twitter     
2  2023-01-15 15:45:00   FitnessFan      Instagram    
3  2023-01-15 18:20:00   AdventureX       Facebook    
4  2023-01-15 19:55:00   ChefCook        Instagram    

                      

In [57]:
# Check unique sentiments
print("\nUnique Sentiments:")
print(df['Sentiment'].unique())
print(f"\nNumber of unique sentiments: {df['Sentiment'].nunique()}")

# Distribution of sentiments
print("\nSentiment Distribution:")
sentiment_counts = df['Sentiment'].value_counts()
print(sentiment_counts)


Unique Sentiments:
[' Positive  ' ' Negative  ' ' Neutral   ' ' Anger        '
 ' Fear         ' ' Sadness      ' ' Disgust      ' ' Happiness    '
 ' Joy          ' ' Love         ' ' Amusement    ' ' Enjoyment    '
 ' Admiration   ' ' Affection    ' ' Awe          ' ' Disappointed '
 ' Surprise     ' ' Acceptance   ' ' Adoration    ' ' Anticipation '
 ' Bitter       ' ' Calmness     ' ' Confusion    ' ' Excitement   '
 ' Kind         ' ' Pride        ' ' Shame        ' ' Confusion '
 ' Excitement ' ' Shame ' ' Elation       ' ' Euphoria      '
 ' Contentment   ' ' Serenity      ' ' Gratitude     ' ' Hope          '
 ' Empowerment   ' ' Compassion    ' ' Tenderness    ' ' Arousal       '
 ' Enthusiasm    ' ' Fulfillment  ' ' Reverence     ' ' Compassion'
 ' Fulfillment   ' ' Reverence ' ' Elation   ' ' Despair         '
 ' Grief           ' ' Loneliness      ' ' Jealousy        '
 ' Resentment      ' ' Frustration     ' ' Boredom         '
 ' Anxiety         ' ' Intimidation    ' ' H

In [49]:
# Clean Sentiment Column

#df1 = df.copy()
df['Sentiment'] = df['Sentiment'].str.strip()

print("\nUnique Sentiments:")
print(df['Sentiment'].value_counts())
print(f"\nNumber of unique sentiments: {df['Sentiment'].nunique()}")
print(df.head())


Unique Sentiments:
Sentiment
Positive                45
Joy                     44
Excitement              37
Contentment             19
Neutral                 18
                        ..
Celestial Wonder         1
Nature's Beauty          1
Thrilling Journey        1
Whispers of the Past     1
Relief                   1
Name: count, Length: 191, dtype: int64

Number of unique sentiments: 191
   Unnamed: 0.1  Unnamed: 0  \
0             0           0   
1             1           1   
2             2           2   
3             3           3   
4             4           4   

                                                Text Sentiment  \
0   Enjoying a beautiful day at the park!        ...  Positive   
1   Traffic was terrible this morning.           ...  Negative   
2   Just finished an amazing workout! ðŸ’ª          ...  Positive   
3   Excited about the upcoming weekend getaway!  ...  Positive   
4   Trying out a new recipe for dinner tonight.  ...   Neutral   

             

In [59]:
# Check for missing values
print(df['Sentiment'].isnull().sum())

0


## Text Preprocessing

In [50]:
class TextPreprocessor:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()

    def clean_text(self, text):
        """Clean and preprocess text"""
        if not isinstance(text, str):
            return ""

        # Convert to lowercase
        text = text.lower()

        # Remove special characters and digits
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Remove extra whitespaces
        text = ' '.join(text.split())

        return text

    def tokenize_text(self, text):
        """Tokenize text into words"""
        return word_tokenize(text)

    def remove_stopwords(self, tokens):
        """Remove stopwords from tokens"""
        return [word for word in tokens if word not in self.stop_words]

    def lemmatize_tokens(self, tokens):
        """Lemmatize tokens"""
        return [self.lemmatizer.lemmatize(word) for word in tokens]

    def stem_tokens(self, tokens):
        """Stem tokens"""
        return [self.stemmer.stem(word) for word in tokens]

    def preprocess_pipeline(self, text, use_stemming=False):
        """Complete preprocessing pipeline"""
        # Clean text
        cleaned_text = self.clean_text(text)

        # Tokenize
        tokens = self.tokenize_text(cleaned_text)

        # Remove stopwords
        tokens = self.remove_stopwords(tokens)

        # Apply stemming or lemmatization
        if use_stemming:
            tokens = self.stem_tokens(tokens)
        else:
            tokens = self.lemmatize_tokens(tokens)

        # Join back to string
        processed_text = ' '.join(tokens)

        return processed_text

# Initialize preprocessor
preprocessor = TextPreprocessor()

# Apply preprocessing to the Text column
print("Processing text data...")
df['Processed_Text'] = df['Text'].apply(lambda x: preprocessor.preprocess_pipeline(x, use_stemming=False))

# Show sample of processed text
print("\nOriginal vs Processed Text:")
sample_idx = np.random.randint(0, len(df))
print(f"Original: {df.loc[sample_idx, 'Text']}")
print(f"Processed: {df.loc[sample_idx, 'Processed_Text']}")

Processing text data...

Original vs Processed Text:
Original: Emotional exhaustion, the weight of the world crushing weary shoulders. 
Processed: emotional exhaustion weight world crushing weary shoulder


## Feature Extraction using TF-IDF

In [51]:
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,  # Use top 5000 features
    min_df=5,          # Ignore terms with document frequency < 5
    max_df=0.7,        # Ignore terms with document frequency > 70%
    ngram_range=(1, 2) # Use unigrams and bigrams
)

# Fit and transform the processed text
print("Creating TF-IDF features...")
X_tfidf = tfidf_vectorizer.fit_transform(df['Processed_Text'])

# Display feature matrix shape
print(f"TF-IDF Matrix Shape: {X_tfidf.shape}")

# Get feature names
feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"Number of features: {len(feature_names)}")
print(f"Sample features: {feature_names[:20]}")

# Get most important words for each sentiment (optional analysis)
def get_top_tfidf_features(tfidf_matrix, feature_names, n=10):
    """Get top n features by TF-IDF score"""
    sums = tfidf_matrix.sum(axis=0)
    data = []
    for col, term in enumerate(feature_names):
        data.append((term, sums[0, col]))
    ranking = pd.DataFrame(data, columns=['term', 'rank'])
    return ranking.sort_values('rank', ascending=False).head(n)

# Get top features overall
top_features = get_top_tfidf_features(X_tfidf.sum(axis=0), feature_names, n=20)
print("\nTop 20 features by TF-IDF score:")
print(top_features)

Creating TF-IDF features...
TF-IDF Matrix Shape: (732, 307)
Number of features: 307
Sample features: ['acceptance' 'accidentally' 'accomplishment' 'achievement' 'achieving'
 'act' 'adventure' 'age' 'ahead' 'air' 'ambivalence' 'amidst' 'ancient'
 'anticipation' 'anxiety' 'around' 'art' 'attended' 'attending' 'away']

Top 20 features by TF-IDF score:
          term       rank
185        new  17.847044
72         day  16.304499
161       life  15.310601
118     friend  13.360149
106    feeling  12.381308
186      night  11.158456
178     moment  11.085681
150        joy  10.811881
43   challenge  10.648070
162       like  10.531975
79       dream  10.500017
133      heart  10.293206
155   laughter  10.042286
305      world   9.903465
212    project   9.312686
81        echo   9.197734
86     emotion   9.170434
275       time   9.134892
16         art   8.919873
22      beauty   8.895100


## Encode Sentiment Labels

In [52]:
# Encode sentiment labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Sentiment'])

# Check encoding mapping
print("Label Encoding Mapping:")
for i, label in enumerate(label_encoder.classes_):
    print(f"{i}: {label}")

# Store class names for later use
class_names = label_encoder.classes_
print(f"\nNumber of classes: {len(class_names)}")

# Check class distribution
print("\nClass Distribution after encoding:")
unique, counts = np.unique(y, return_counts=True)
for label, count in zip(unique, counts):
    print(f"Class {label} ({class_names[label]}): {count} samples")

Label Encoding Mapping:
0: Acceptance
1: Accomplishment
2: Admiration
3: Adoration
4: Adrenaline
5: Adventure
6: Affection
7: Amazement
8: Ambivalence
9: Amusement
10: Anger
11: Anticipation
12: Anxiety
13: Appreciation
14: Apprehensive
15: Arousal
16: ArtisticBurst
17: Awe
18: Bad
19: Betrayal
20: Bitter
21: Bitterness
22: Bittersweet
23: Blessed
24: Boredom
25: Breakthrough
26: Calmness
27: Captivation
28: Celebration
29: Celestial Wonder
30: Challenge
31: Charm
32: Colorful
33: Compassion
34: Compassionate
35: Confidence
36: Confident
37: Confusion
38: Connection
39: Contemplation
40: Contentment
41: Coziness
42: Creative Inspiration
43: Creativity
44: Culinary Adventure
45: CulinaryOdyssey
46: Curiosity
47: Darkness
48: Dazzle
49: Desolation
50: Despair
51: Desperation
52: Determination
53: Devastated
54: Disappointed
55: Disappointment
56: Disgust
57: Dismissive
58: DreamChaser
59: Ecstasy
60: Elation
61: Elegance
62: Embarrassed
63: Emotion
64: EmotionalStorm
65: Empathetic
66: E

## Split Data into Train and Test Sets

In [53]:
# Identify classes with only one sample to exclude them from stratified split
unique_classes, counts_classes = np.unique(y, return_counts=True)
classes_to_keep = unique_classes[counts_classes >= 2]

# Create a boolean mask for samples belonging to classes with at least 2 instances
mask = np.isin(y, classes_to_keep)

# Filter X_tfidf and y
X_filtered = X_tfidf[mask]
y_filtered = y[mask]

print(f"Original dataset size: {len(y)}")
print(f"Dataset size after filtering single-sample classes: {len(y_filtered)}")

# Split data into train and test sets using the filtered data
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y_filtered, test_size=0.25, random_state=42, stratify=y_filtered
)

print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
print(f"Number of features: {X_train.shape[1]}")

# Check class distribution in splits
print("\nClass distribution in training set:")
unique_train, counts_train = np.unique(y_train, return_counts=True)
for label, count in zip(unique_train, counts_train):
    print(f"Class {label} ({class_names[label]}): {count} samples")

print("\nClass distribution in test set:")
unique_test, counts_test = np.unique(y_test, return_counts=True)
for label, count in zip(unique_test, counts_test):
    print(f"Class {label} ({class_names[label]}): {count} samples")

Original dataset size: 732
Dataset size after filtering single-sample classes: 653

Training set size: 489
Test set size: 164
Number of features: 307

Class distribution in training set:
Class 0 (Acceptance): 6 samples
Class 1 (Accomplishment): 2 samples
Class 2 (Admiration): 3 samples
Class 3 (Adoration): 2 samples
Class 5 (Adventure): 2 samples
Class 6 (Affection): 2 samples
Class 8 (Ambivalence): 4 samples
Class 9 (Amusement): 2 samples
Class 10 (Anger): 2 samples
Class 11 (Anticipation): 2 samples
Class 12 (Anxiety): 2 samples
Class 14 (Apprehensive): 2 samples
Class 15 (Arousal): 3 samples
Class 17 (Awe): 7 samples
Class 18 (Bad): 4 samples
Class 19 (Betrayal): 4 samples
Class 20 (Bitter): 2 samples
Class 21 (Bitterness): 4 samples
Class 24 (Boredom): 3 samples
Class 26 (Calmness): 3 samples
Class 27 (Captivation): 2 samples
Class 33 (Compassion): 3 samples
Class 34 (Compassionate): 3 samples
Class 36 (Confident): 2 samples
Class 37 (Confusion): 6 samples
Class 39 (Contemplation):

## Train Classification Models And Evaluation

In [54]:
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    """Train a model and evaluate its performance"""
    print(f"\n{'='*60}")
    print(f"Training {model_name}...")

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Get actual labels present in y_test and y_pred
    actual_labels = np.unique(np.concatenate((y_test, y_pred)))
    # Filter class names to match the actual labels
    filtered_class_names = [class_names[label] for label in actual_labels]

    # Generate classification report
    report = classification_report(y_test, y_pred, target_names=filtered_class_names, output_dict=True, labels=actual_labels)

    # Create confusion matrix
    cm = confusion_matrix(y_test, y_pred, labels=actual_labels)

    return {
        'model': model,
        'accuracy': accuracy,
        'report': report,
        'confusion_matrix': cm,
        'predictions': y_pred
    }

# Initialize models
models = {
    'Naive Bayes': MultinomialNB(alpha=0.1),
    'Logistic Regression': LogisticRegression(
        max_iter=1000,
        random_state=42,
        class_weight='balanced'
    ),
    'SVM': SVC(kernel='linear', random_state=42, probability=True),
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        class_weight='balanced'
    )
}

# Train and evaluate each model
results = {}
for model_name, model in models.items():
    results[model_name] = train_and_evaluate_model(
        model, X_train, y_train, X_test, y_test, model_name
    )

    print(f"Accuracy: {results[model_name]['accuracy']:.4f}")



Training Naive Bayes...
Accuracy: 0.4451

Training Logistic Regression...
Accuracy: 0.4024

Training SVM...
Accuracy: 0.4817

Training Random Forest...
Accuracy: 0.5183
