## 11. 📊 Model Monitoring and Maintenance

Monitor deployed models and maintain performance over time.

### 📈 11.1 Performance Monitoring

```python
import logging
from datetime import datetime, timedelta
import sqlite3
from collections import defaultdict

class ModelMonitor:
    def __init__(self, db_path: str = "model_monitoring.db") -> None:
        """Remember the SQLite database path and make sure the tables exist.

        Args:
            db_path: Path handed to ``sqlite3.connect`` by every method.
        """
        self.db_path = db_path
        # Runs the CREATE TABLE IF NOT EXISTS statements right away.
        self.init_database()
    
    def init_database(self):
        """Initialize monitoring database"""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS predictions (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                timestamp DATETIME,
                input_text TEXT,
                prediction TEXT,
                confidence FLOAT,
                processing_time FLOAT,
                model_version TEXT
            )
        ''')
        
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS performance_metrics (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                date DATE,
                accuracy FLOAT,
                precision FLOAT,
                recall FLOAT,
                f1_score FLOAT,
                avg_confidence FLOAT,
                total_predictions INTEGER
            )
        ''')
        
        conn.commit()
        conn.close()
    
    def log_prediction(self, input_text, prediction, confidence, processing_time, model_version):
        """Log individual prediction"""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        cursor.execute('''
            INSERT INTO predictions 
            (timestamp, input_text, prediction, confidence, processing_time, model_version)
            VALUES (?, ?, ?, ?, ?, ?)
        ''', (datetime.now(), input_text, prediction, confidence, processing_time, model_version))
        
        conn.commit()
        conn.close()
    
    def get_daily_stats(self, days_back=7):
        """Get daily performance statistics"""
        conn = sqlite3.connect(self.db_path)
        
        query = '''
            SELECT 
                DATE(timestamp) as date,
                COUNT(*) as total_predictions,
                AVG(confidence) as avg_confidence,
                AVG(processing_time) as avg_processing_time,
                prediction,
                COUNT(*) as prediction_count
            FROM predictions 
            WHERE timestamp >= datetime('now', '-{} days')
            GROUP BY DATE(timestamp), prediction
```

## 8. 🔁 Advanced Hyperparameter Tuning

Optimize model performance with systematic parameter search.

### 🔍 8.1 Grid Search with Cross-Validation

```python
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline

# Create a pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', LogisticRegression(random_state=42))
])

# Define parameter grid
# Keys use the "<step>__<parameter>" form, where <step> is a name given to
# the Pipeline above ('tfidf' or 'classifier').
# Grid size: 3 * 3 * 3 * 3 * 3 * 2 * 2 = 972 parameter combinations.
param_grid = {
    'tfidf__max_features': [5000, 10000, 15000],
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__min_df': [1, 2, 3],
    'tfidf__max_df': [0.8, 0.9, 1.0],
    'classifier__C': [0.1, 1, 10],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga']
}

# Grid search
# With cv=5 every combination is fitted five times: 972 * 5 = 4,860 fits,
# scored with the weighted F1 metric.
grid_search = GridSearchCV(
    pipeline, param_grid, cv=5, 
    scoring='f1_weighted', n_jobs=-1, verbose=1
)

# X_train_text is passed straight to the pipeline, whose first step is the
# TF-IDF vectorizer.
grid_search.fit(X_train_text, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)
```

### ⚖️ 5.11 Final Feature Matrix with Optimal Dimensions

```python
# Create final optimized feature matrix
def create_optimized_feature_matrix():
    """
    Build the reduced feature matrix from the module-level feature sets.

    Reads ``optimal_dense``, ``optimal_tfidf``, ``dense_features_combined``,
    ``X_word_tfidf`` and ``X_combined`` from the enclosing scope.

    Returns:
        Tuple ``(X_optimized, pca_optimal, svd_optimal)``: the stacked
        matrix plus the fitted PCA and TruncatedSVD reducers.
    """
    # Dense features: standardize, then project with PCA
    pca_optimal = PCA(n_components=optimal_dense, random_state=42)
    scaled_dense = StandardScaler().fit_transform(dense_features_combined)
    dense_reduced = pca_optimal.fit_transform(scaled_dense)

    # Sparse TF-IDF features: project with Truncated SVD
    svd_optimal = TruncatedSVD(n_components=optimal_tfidf, random_state=42)
    tfidf_reduced = svd_optimal.fit_transform(X_word_tfidf)

    # TF-IDF columns come first, dense columns second
    X_optimized = np.hstack([tfidf_reduced, dense_reduced])

    width_before = X_combined.shape[1]
    width_after = X_optimized.shape[1]
    print(f"Optimized feature matrix shape: {X_optimized.shape}")
    print(f"Dimension reduction: {width_before} -> {width_after}")
    print(f"Reduction ratio: {width_after/width_before:.2%}")

    return X_optimized, pca_optimal, svd_optimal

X_optimized, pca_final, svd_final = create_optimized_feature_matrix()
```

---

### 🎲 8.2 Randomized Search

```python
from scipy.stats import uniform, randint

# Define parameter distributions
# The solver is sampled too: the pipeline's LogisticRegression is created
# with its default solver, which does not support penalty='l1', so every
# sampled 'l1' candidate would otherwise fail to fit.
param_dist = {
    'tfidf__max_features': randint(5000, 20000),
    'tfidf__min_df': randint(1, 5),
    'tfidf__max_df': uniform(0.7, 0.3),   # uniform(loc, scale) -> [0.7, 1.0]
    'classifier__C': uniform(0.1, 10),    # uniform(loc, scale) -> [0.1, 10.1]
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga']
}

# Randomized search
random_search = RandomizedSearchCV(
    pipeline, param_dist, n_iter=50, cv=5,
    scoring='f1_weighted', n_jobs=-1, random_state=42
)

random_search.fit(X_train_text, y_train)

print("Best parameters:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)
```

---

### 🏆 8.3 Bayesian Optimization

```python
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

# Define search space
# The solver is part of the space: the pipeline's LogisticRegression is
# created with its default solver, which does not support penalty='l1'.
search_space = {
    'tfidf__max_features': Integer(5000, 20000),
    'tfidf__min_df': Integer(1, 5),
    'tfidf__max_df': Real(0.7, 1.0),
    'classifier__C': Real(0.01, 100, prior='log-uniform'),
    'classifier__penalty': Categorical(['l1', 'l2']),
    'classifier__solver': Categorical(['liblinear', 'saga'])
}

# Bayesian optimization
bayes_search = BayesSearchCV(
    pipeline, search_space, n_iter=30, cv=5,
    scoring='f1_weighted', n_jobs=-1, random_state=42
)

bayes_search.fit(X_train_text, y_train)

print("Best parameters:", bayes_search.best_params_)
print("Best cross-validation score:", bayes_search.best_score_)
```

---

### 📊 8.4 Hyperparameter Tuning Visualization

```python
def plot_validation_curve(estimator, X, y, param_name, param_range):
    from sklearn.model_selection import validation_curve
    
    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=param_range,
        cv=5, scoring='f1_weighted', n_jobs=-1
    )
    
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    
    plt.figure(figsize=(10, 6))
    plt.plot(param_range, train_mean, 'o-', color='blue
```

# 🧠 NLP Model Training Pipeline

This document outlines the full pipeline of tasks for training an NLP model, starting from raw text data to model deployment. Each step includes techniques and tools commonly used in modern NLP workflows.

---

## 0. 📊 Data Gathering

Collect and prepare text data from various sources for NLP model training.

### 📂 0.1 Data Sources

```python
import pandas as pd
import requests
import os
from pathlib import Path

# Local files
def load_local_data():
    """Read the sample CSV, JSON and plain-text files from ``data/``.

    Returns:
        Tuple ``(df_csv, df_json, text_data)``: two DataFrames and the list
        of lines read from ``data/documents.txt``.
    """
    csv_frame = pd.read_csv('data/text_data.csv')
    json_frame = pd.read_json('data/text_data.json')

    # readlines() keeps the trailing newline of every line
    with open('data/documents.txt', 'r', encoding='utf-8') as handle:
        lines = handle.readlines()

    return csv_frame, json_frame, lines

# Web scraping
def scrape_web_data(urls):
    """Download each URL and return the visible page text.

    Failures are reported on stdout and the URL is skipped, so one bad page
    does not abort the whole run.

    Args:
        urls: Iterable of URL strings.

    Returns:
        DataFrame with one row per successfully scraped page and the
        columns ``url`` and ``text``.
    """
    from bs4 import BeautifulSoup
    import time
    
    scraped_data = []
    for url in urls:
        try:
            # Bounded wait: without a timeout one unresponsive host would
            # block the loop indefinitely.
            response = requests.get(url, timeout=10)
            # Treat 4xx/5xx responses as failures instead of storing the
            # error page as scraped text.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            text = soup.get_text()
            scraped_data.append({'url': url, 'text': text})
            time.sleep(1)  # Be respectful
        except Exception as e:
            # Deliberately broad: scraping is best-effort per URL.
            print(f"Error scraping {url}: {e}")
    
    return pd.DataFrame(scraped_data)
```

---

### 🔌 0.2 API Data Collection

```python
# Twitter API (example)
def collect_twitter_data(query, count=100):
    """Search English tweets matching *query* and return them as a DataFrame.

    The credentials ``consumer_key``, ``consumer_secret``, ``access_token``
    and ``access_token_secret`` are read from the enclosing scope.

    Args:
        query: Search query passed to the API.
        count: Maximum number of tweets to fetch.

    Returns:
        DataFrame with one row per tweet.
    """
    # Using tweepy or similar library
    import tweepy

    # API authentication
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)

    search = tweepy.Cursor(api.search_tweets, q=query, lang="en")
    records = [
        {
            'id': status.id,
            'text': status.text,
            'created_at': status.created_at,
            'user': status.user.screen_name,
            'retweet_count': status.retweet_count,
            'favorite_count': status.favorite_count,
        }
        for status in search.items(count)
    ]

    return pd.DataFrame(records)

# Reddit API
def collect_reddit_data(subreddit, limit=100):
    """Fetch the current "hot" posts of *subreddit* as a DataFrame.

    Args:
        subreddit: Name of the subreddit to read.
        limit: Maximum number of posts to fetch.

    Returns:
        DataFrame with one row per post.
    """
    import praw

    # Placeholder credentials; replace before running.
    client = praw.Reddit(
        client_id="your_client_id",
        client_secret="your_client_secret",
        user_agent="your_user_agent"
    )

    hot_posts = client.subreddit(subreddit).hot(limit=limit)
    rows = [
        {
            'title': submission.title,
            'text': submission.selftext,
            'score': submission.score,
            'num_comments': submission.num_comments,
            'created_utc': submission.created_utc,
        }
        for submission in hot_posts
    ]

    return pd.DataFrame(rows)
```

---

### 🗄️ 0.3 Database Integration

```python
import sqlite3
from sqlalchemy import create_engine

# SQLite
def load_from_sqlite(db_path, query):
    """Run *query* against the SQLite file at *db_path*.

    Args:
        db_path: Path of the SQLite database file.
        query: SQL text to execute.

    Returns:
        DataFrame holding the query result.

    Raises:
        Exception: Whatever ``pandas.read_sql_query`` raises for a failing
            query; the connection is closed before it propagates.
    """
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(query, conn)
    finally:
        # Close even when the query fails so the handle is not leaked.
        conn.close()

# PostgreSQL/MySQL
def load_from_sql_server(connection_string, query):
    """Run *query* through a SQLAlchemy engine and return a DataFrame.

    Args:
        connection_string: URL passed to ``create_engine``.
        query: SQL text to execute.
    """
    return pd.read_sql_query(query, create_engine(connection_string))

# Example usage
# df = load_from_sqlite('data/texts.db', 'SELECT * FROM documents')
# df = load_from_sql_server('postgresql://user:pass@localhost/db', 'SELECT * FROM reviews')
```

---

### 📋 0.4 Data Cataloging and Metadata

```python
def create_data_catalog(dataframes_dict):
    """Summarize each named DataFrame as one row of a catalog DataFrame.

    Args:
        dataframes_dict: Mapping of dataset name to DataFrame.

    Returns:
        DataFrame with one row per dataset describing its shape, columns,
        object-dtype columns, missing values, dtypes, memory usage and
        first row (``None`` for an empty dataset).
    """
    def describe(name, frame):
        # Object-dtype columns are treated as the text columns
        object_columns = [column for column in frame.columns if frame[column].dtype == 'object']
        first_row = None if frame.empty else frame.iloc[0].to_dict()
        return {
            'dataset_name': name,
            'shape': frame.shape,
            'columns': list(frame.columns),
            'text_columns': object_columns,
            'missing_values': frame.isnull().sum().to_dict(),
            'data_types': frame.dtypes.to_dict(),
            'memory_usage': frame.memory_usage(deep=True).sum(),
            'sample_text': first_row,
        }

    return pd.DataFrame([describe(name, frame) for name, frame in dataframes_dict.items()])

# Example usage
datasets = {
    'reviews': df_reviews,
    'tweets': df_tweets,
    'articles': df_articles
}
catalog = create_data_catalog(datasets)
print(catalog)
```

---

### 🔄 0.5 Data Integration and Consolidation

```python
def consolidate_datasets(datasets_dict, text_column_mapping):
    """
    Merge several datasets into one frame with a common schema.

    Args:
        datasets_dict: Mapping of dataset name to DataFrame.
        text_column_mapping: Mapping of dataset name to the column holding
            its text; datasets without an entry use ``'text'``.

    Returns:
        DataFrame with the columns ``id`` (position within the source
        dataset), ``text`` and ``source``, plus ``label`` for datasets
        that provide one.
    """
    pieces = []

    for dataset_name, df in datasets_dict.items():
        source_column = text_column_mapping.get(dataset_name, 'text')

        # 'label' is carried over only when the dataset has it
        wanted = ['id', 'text', 'source']
        if 'label' in df.columns:
            wanted = wanted + ['label']

        # assign() works on a copy, so the caller's frame is left untouched
        tagged = df.assign(
            text=df[source_column],
            source=dataset_name,
            id=range(len(df)),
        )
        pieces.append(tagged[wanted])

    final_df = pd.concat(pieces, ignore_index=True)

    print(f"Consolidated dataset shape: {final_df.shape}")
    print(f"Sources: {final_df['source'].value_counts()}")

    return final_df

# Example usage
text_mapping = {
    'reviews': 'review_text',
    'tweets': 'tweet_text',
    'articles': 'content'
}
consolidated_df = consolidate_datasets(datasets, text_mapping)
```

---

## 1. 🧼 Data Cleaning

### 🔡 1.1 Lowercasing
Convert all characters to lowercase to ensure consistency.

```python
text = text.lower()
```

---

### 🧽 1.2 Removing Leading and Trailing Spaces

Remove unnecessary whitespace.

```python
text = text.strip()
```

---

### 🏷️ 1.3 Removing HTML Tags

Strip out HTML elements using regex.

```python
import re
text = re.sub(r'<.*?>', '', text)
```

---

### 🔗 1.4 Removing URLs

Remove links from the text.

```python
# Match explicit http:// or https:// links and bare "www." hosts. Requiring
# "://" and the dot keeps ordinary words that merely start with "http" or
# "www" in the text.
text = re.sub(r'https?://\S+|www\.\S+', '', text)
```

---

### ✏️ 1.5 Expanding Contractions

Convert contractions to their full forms.

```python
import contractions
text = contractions.fix(text)
```

---

### ✅ 1.6 Spelling Correction

Correct spelling mistakes.

```python
from textblob import TextBlob
text = str(TextBlob(text).correct())
```

---

### ❗ 1.7 Removing Punctuation

Remove punctuation marks.

```python
import string
text = text.translate(str.maketrans('', '', string.punctuation))
```

---

### 🔣 1.8 Removing Special Characters

Remove every character that is not a letter, a digit, or whitespace.

```python
text = re.sub(r'[^A-Za-z0-9\s]', '', text)
```

---

## 2. 🧪 Text Preprocessing

### 🔪 2.1 Tokenization

Split the text into individual tokens (words).

```python
from nltk.tokenize import word_tokenize
tokens = word_tokenize(text)
```

---

### 🛑 2.2 Stopword Removal

Remove common stopwords like "the", "is", "in".

```python
from nltk.corpus import stopwords
# Build the stopword set once: the original comprehension re-evaluated
# stopwords.words('english') and scanned the resulting list for every token.
stop_words = set(stopwords.words('english'))
tokens = [word for word in tokens if word not in stop_words]
```

---

### 🌿 2.3 Lemmatization or Stemming

Reduce words to their base/root form.

**Lemmatization:**

```python
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(word) for word in tokens]
```

**Stemming:**

```python
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
tokens = [stemmer.stem(word) for word in tokens]
```

---

### 🧱 2.4 Joining Back the Tokens

Reconstruct the cleaned sentence (if needed).

```python
clean_text = " ".join(tokens)
```

---

## 3. 📊 Exploratory Data Analysis (EDA)

Understand your text data through comprehensive analysis.

### 📈 3.1 Basic Statistics

```python
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Basic text statistics
df['text_length'] = df['text'].str.len()
df['word_count'] = df['text'].str.split().str.len()
df['sentence_count'] = df['text'].str.count(r'[.!?]+')

print(f"Average text length: {df['text_length'].mean():.2f}")
print(f"Average word count: {df['word_count'].mean():.2f}")
```

---

### 📊 3.2 Distribution Analysis

```python
# Distribution plots
plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
plt.hist(df['text_length'], bins=50, alpha=0.7)
plt.title('Text Length Distribution')
plt.xlabel('Character Count')

plt.subplot(1, 3, 2)
plt.hist(df['word_count'], bins=50, alpha=0.7)
plt.title('Word Count Distribution')
plt.xlabel('Word Count')

plt.subplot(1, 3, 3)
df['label'].value_counts().plot(kind='bar')
plt.title('Class Distribution')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()
```

---

### ☁️ 3.3 Word Cloud Visualization

Generate word clouds to visualize the most frequent terms.

```python
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Overall word cloud: built from every document in the corpus, matching the
# class-specific clouds below, rather than from a single cleaned string.
all_text = ' '.join(df['clean_text'])
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Overall Word Cloud')
plt.show()

# Class-specific word clouds
for label in df['label'].unique():
    class_text = ' '.join(df[df['label'] == label]['clean_text'])
    wordcloud = WordCloud(width=400, height=300, background_color='white').generate(class_text)
    plt.figure(figsize=(8, 4))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Word Cloud - {label}')
    plt.show()
```

---

### 🔍 3.4 Most Frequent Words Analysis

```python
from collections import Counter
import nltk

# Most frequent words overall
all_words = ' '.join(df['clean_text']).split()
word_freq = Counter(all_words)
top_words = word_freq.most_common(20)

plt.figure(figsize=(12, 6))
words, counts = zip(*top_words)
plt.bar(words, counts)
plt.title('Top 20 Most Frequent Words')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
```

---

### 📏 3.5 N-gram Analysis

```python
from sklearn.feature_extraction.text import CountVectorizer

# Bigrams
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=20)
bigrams = bigram_vectorizer.fit_transform(df['clean_text'])
bigram_freq = bigrams.sum(axis=0).A1
bigram_names = bigram_vectorizer.get_feature_names_out()

plt.figure(figsize=(12, 6))
indices = bigram_freq.argsort()[-20:]
plt.barh(range(len(indices)), bigram_freq[indices])
plt.yticks(range(len(indices)), [bigram_names[i] for i in indices])
plt.title('Top 20 Bigrams')
plt.tight_layout()
plt.show()
```

---

## 4. 🔧 Advanced Feature Engineering

Create comprehensive features from text data.

### 📊 4.1 Statistical Features

```python
import numpy as np
from textstat import flesch_reading_ease, flesch_kincaid_grade

def extract_statistical_features(text):
    """Compute simple length, character-class and readability features.

    Args:
        text: Raw text of one document.

    Returns:
        Dict with the keys ``char_count``, ``word_count``,
        ``sentence_count``, ``avg_word_length``, ``punctuation_count``,
        ``uppercase_count``, ``digit_count``, ``readability_score`` and
        ``grade_level``.
    """
    # Imported here because this snippet's own import list does not include it.
    import string

    words = text.split()
    return {
        'char_count': len(text),
        'word_count': len(words),
        # Number of '.'-separated pieces (1 for text without a period)
        'sentence_count': len(text.split('.')),
        # np.mean([]) is NaN and emits a RuntimeWarning; report 0.0 for
        # text that contains no words.
        'avg_word_length': np.mean([len(word) for word in words]) if words else 0.0,
        'punctuation_count': sum(1 for char in text if char in string.punctuation),
        'uppercase_count': sum(1 for char in text if char.isupper()),
        'digit_count': sum(1 for char in text if char.isdigit()),
        'readability_score': flesch_reading_ease(text),
        'grade_level': flesch_kincaid_grade(text)
    }

# Apply to dataframe
statistical_features = df['text'].apply(extract_statistical_features)
statistical_df = pd.DataFrame(statistical_features.tolist())
```

---

### 🏷️ 4.2 Part-of-Speech Features

```python
import nltk
from collections import Counter

def extract_pos_features(text):
    """Return the share of nouns, verbs, adjectives and adverbs in *text*.

    Tags are grouped by prefix, so every tag starting with ``NN`` counts as
    a noun, ``VB`` as a verb, ``JJ`` as an adjective and ``RB`` as an
    adverb. Matching only the exact tags ``NN``/``VB``/``JJ``/``RB`` would
    ignore plural and proper nouns, inflected verbs, and comparative or
    superlative forms.

    Args:
        text: Text of one document.

    Returns:
        Dict with ``noun_ratio``, ``verb_ratio``, ``adj_ratio`` and
        ``adv_ratio``; all ``0.0`` when the text has no tokens.
    """
    tokens = word_tokenize(text)
    total_words = len(tokens)
    if total_words == 0:
        # Avoid ZeroDivisionError on empty or whitespace-only documents.
        return {
            'noun_ratio': 0.0,
            'verb_ratio': 0.0,
            'adj_ratio': 0.0,
            'adv_ratio': 0.0
        }

    tags = [tag for _, tag in nltk.pos_tag(tokens)]

    def ratio(prefix):
        return sum(1 for tag in tags if tag.startswith(prefix)) / total_words

    return {
        'noun_ratio': ratio('NN'),
        'verb_ratio': ratio('VB'),
        'adj_ratio': ratio('JJ'),
        'adv_ratio': ratio('RB')
    }

pos_features = df['clean_text'].apply(extract_pos_features)
pos_df = pd.DataFrame(pos_features.tolist())
```

---

### 😊 4.3 Sentiment Features

```python
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

def extract_sentiment_features(text):
    """Score *text* with TextBlob and VADER.

    Uses the module-level ``analyzer`` for the VADER scores.

    Returns:
        Dict with TextBlob polarity and subjectivity plus the VADER
        compound, positive, negative and neutral scores.
    """
    blob = TextBlob(text)
    vader = analyzer.polarity_scores(text)
    textblob_sentiment = blob.sentiment

    features = {}
    features['textblob_polarity'] = textblob_sentiment.polarity
    features['textblob_subjectivity'] = textblob_sentiment.subjectivity
    features['vader_compound'] = vader['compound']
    features['vader_positive'] = vader['pos']
    features['vader_negative'] = vader['neg']
    features['vader_neutral'] = vader['neu']
    return features

sentiment_features = df['clean_text'].apply(extract_sentiment_features)
sentiment_df = pd.DataFrame(sentiment_features.tolist())
```

---

## 5. 🔢 Text Vectorization

Convert text into numerical representations for machine learning.

### 🧮 5.1 Bag of Words (BoW)

```python
from sklearn.feature_extraction.text import CountVectorizer

# Basic BoW
bow_vectorizer = CountVectorizer(
    max_features=5000,
    min_df=2,
    max_df=0.8,
    stop_words='english'
)
X_bow = bow_vectorizer.fit_transform(df['clean_text'])

print(f"BoW shape: {X_bow.shape}")
```

---

### 🧠 5.2 TF-IDF Vectorization

```python
from sklearn.feature_extraction.text import TfidfVectorizer

# Character-level TF-IDF
char_tfidf = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 6),
    max_features=50000,
    strip_accents='unicode',
    lowercase=True
)

# Word-level TF-IDF
word_tfidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    max_features=30000,
    min_df=2,
    max_df=0.8,
    stop_words='english'
)

X_char_tfidf = char_tfidf.fit_transform(df['clean_text'])
X_word_tfidf = word_tfidf.fit_transform(df['clean_text'])

print(f"Character TF-IDF shape: {X_char_tfidf.shape}")
print(f"Word TF-IDF shape: {X_word_tfidf.shape}")
```

---

### 🔡 5.3 Word Embeddings

```python
# Word2Vec
from gensim.models import Word2Vec
import numpy as np

def create_word2vec_features(texts, vector_size=100):
    """Train Word2Vec on *texts* and return one averaged vector per text.

    Args:
        texts: Collection of whitespace-tokenizable strings.
        vector_size: Dimensionality of the word vectors.

    Returns:
        Array with one row per text: the mean of its word vectors, or a
        zero vector when none of its words has a vector.
    """
    # Whitespace tokenization
    corpus = [document.split() for document in texts]

    w2v = Word2Vec(corpus, vector_size=vector_size, window=5, min_count=1, workers=4)
    vectors = w2v.wv

    def embed(document):
        known = [vectors[token] for token in document.split() if token in vectors]
        if not known:
            return np.zeros(vector_size)
        return np.mean(known, axis=0)

    return np.array([embed(document) for document in texts])

X_word2vec = create_word2vec_features(df['clean_text'])
print(f"Word2Vec shape: {X_word2vec.shape}")
```

---

### 🤗 5.4 Pre-trained Embeddings (BERT)

```python
from transformers import BertTokenizer, BertModel
import torch

# Initialize BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(texts, max_length=512):
    """Encode each text with BERT and return the first-token embeddings.

    Uses the module-level ``tokenizer`` and ``model``; texts are processed
    one at a time.

    Args:
        texts: Iterable of strings.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        Array with one flattened embedding per text.
    """
    def embed_one(text):
        encoded = tokenizer(text, return_tensors='pt', max_length=max_length,
                            truncation=True, padding=True)
        # Inference only: no gradients are tracked
        with torch.no_grad():
            hidden = model(**encoded).last_hidden_state
            # Position 0 holds the [CLS] token
            return hidden[:, 0, :].numpy().flatten()

    return np.array([embed_one(text) for text in texts])

# For small datasets only (BERT is computationally expensive)
# X_bert = get_bert_embeddings(df['clean_text'].head(100))
```

---

### 🔄 5.6 Principal Component Analysis (PCA)

Reduce dimensionality of high-dimensional text features while preserving variance.

```python
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# PCA for dense features (statistical, sentiment, POS features)
def apply_pca_dense_features(dense_features, n_components=0.95):
    """
    Standardize dense features and reduce them with PCA.

    Args:
        dense_features: 2-D feature matrix.
        n_components: float (0-1) for variance ratio or int for number of
            components.

    Returns:
        Tuple ``(features_pca, pca, scaler)``: the projected features and
        the fitted PCA and StandardScaler objects.
    """
    scaler = StandardScaler()
    pca = PCA(n_components=n_components, random_state=42)

    # Scale first, then project
    features_pca = pca.fit_transform(scaler.fit_transform(dense_features))
    explained = pca.explained_variance_ratio_.sum()

    print(f"Original features shape: {dense_features.shape}")
    print(f"PCA features shape: {features_pca.shape}")
    print(f"Explained variance ratio: {explained:.3f}")

    return features_pca, pca, scaler

# Apply PCA to combined dense features
dense_features_combined = np.hstack([statistical_scaled, sentiment_scaled, pos_scaled])
features_pca, pca_model, scaler_model = apply_pca_dense_features(dense_features_combined)
```

---

### 📊 5.7 Truncated SVD for Sparse Features

```python
# Truncated SVD for sparse TF-IDF features
def apply_truncated_svd(sparse_features, n_components=100):
    """
    Reduce sparse features (like TF-IDF) with Truncated SVD.

    Args:
        sparse_features: Feature matrix to reduce.
        n_components: Number of output dimensions.

    Returns:
        Tuple ``(features_svd, svd)``: the reduced features and the fitted
        TruncatedSVD object.
    """
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    reduced = svd.fit_transform(sparse_features)
    explained = svd.explained_variance_ratio_.sum()

    print(f"Original sparse features shape: {sparse_features.shape}")
    print(f"SVD features shape: {reduced.shape}")
    print(f"Explained variance ratio: {explained:.3f}")

    return reduced, svd

# Apply SVD to TF-IDF features
X_tfidf_svd, svd_model = apply_truncated_svd(X_word_tfidf, n_components=200)
X_char_tfidf_svd, char_svd_model = apply_truncated_svd(X_char_tfidf, n_components=100)
```

---

### 📈 5.8 PCA Visualization and Analysis

```python
def plot_pca_analysis(pca_model, features_pca, max_components=20):
    """
    Visualize PCA results in a single three-panel figure.

    Panels: per-component explained variance, cumulative explained
    variance, and a scatter of the first two principal components.

    Args:
        pca_model: Fitted object exposing ``explained_variance_ratio_``.
        features_pca: Projected features; columns 0 and 1 are plotted.
        max_components: Upper bound on the number of components shown in
            the two variance panels.
    """
    # One wide figure holding all three panels
    plt.figure(figsize=(15, 5))
    
    # Individual explained variance (x axis is 1-based component number)
    plt.subplot(1, 3, 1)
    plt.plot(range(1, min(len(pca_model.explained_variance_ratio_), max_components) + 1), 
             pca_model.explained_variance_ratio_[:max_components], 'bo-')
    plt.title('Explained Variance by Component')
    plt.xlabel('Principal Component')
    plt.ylabel('Explained Variance Ratio')
    plt.grid(True)
    
    # Cumulative explained variance
    plt.subplot(1, 3, 2)
    cumsum = np.cumsum(pca_model.explained_variance_ratio_[:max_components])
    plt.plot(range(1, len(cumsum) + 1), cumsum, 'ro-')
    plt.title('Cumulative Explained Variance')
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.grid(True)
    
    # 2D PCA scatter plot; drawn only when at least two components exist.
    # Points are not colored by label: no labels are passed to this function.
    if features_pca.shape[1] >= 2:
        plt.subplot(1, 3, 3)
        plt.scatter(features_pca[:, 0], features_pca[:, 1], alpha=0.6)
        plt.title('First Two Principal Components')
        plt.xlabel(f'PC1 ({pca_model.explained_variance_ratio_[0]:.2%} variance)')
        plt.ylabel(f'PC2 ({pca_model.explained_variance_ratio_[1]:.2%} variance)')
        plt.grid(True)
    
    plt.tight_layout()
    plt.show()

# Visualize PCA results
plot_pca_analysis(pca_model, features_pca)
```

---

### 🔗 5.9 Advanced Feature Combination with Dimensionality Reduction

```python
# Combine all reduced features
def create_final_feature_matrix():
    """
    Stack the reduced feature sets and name every resulting column.

    Reads ``X_tfidf_svd``, ``X_char_tfidf_svd``, ``features_pca`` and
    ``X_word2vec`` from the enclosing scope.

    Returns:
        Tuple ``(X_final, feature_names)``.
    """
    # Keep at most the first 50 word2vec dimensions; slicing a narrower
    # matrix with [:, :50] returns all of its columns.
    word2vec_part = X_word2vec[:, :50]

    X_final = np.hstack([X_tfidf_svd, X_char_tfidf_svd, features_pca, word2vec_part])

    print(f"Final feature matrix shape: {X_final.shape}")

    # Feature names for interpretability, in the same order as the stack
    feature_names = []
    feature_names += [f'tfidf_svd_{i}' for i in range(X_tfidf_svd.shape[1])]
    feature_names += [f'char_tfidf_svd_{i}' for i in range(X_char_tfidf_svd.shape[1])]
    feature_names += [f'dense_pca_{i}' for i in range(features_pca.shape[1])]
    feature_names += [f'word2vec_{i}' for i in range(word2vec_part.shape[1])]

    return X_final, feature_names

X_final, final_feature_names = create_final_feature_matrix()
```

---

### 🎯 5.10 Optimal Number of Components Selection

```python
def find_optimal_components(data, method='pca', max_components=100, variance_threshold=0.95):
    """
    Find the number of components needed to reach a variance threshold.

    Fits PCA (on standardized data) or Truncated SVD, plots the cumulative
    explained variance and reports the smallest component count whose
    cumulative variance reaches ``variance_threshold``.

    Args:
        data: 2-D feature matrix (dense for ``'pca'``; ``'svd'`` passes it
            to TruncatedSVD unchanged).
        method: ``'pca'`` or ``'svd'``.
        max_components: Upper bound on the number of components fitted; it
            is reduced automatically when the data is smaller.
        variance_threshold: Target cumulative explained variance (0-1).

    Returns:
        The selected number of components as an ``int``. When the threshold
        is never reached, this is the number of components fitted.

    Raises:
        ValueError: If ``method`` is neither ``'pca'`` nor ``'svd'``.
    """
    n_samples, n_features = np.shape(data)

    if method == 'pca':
        # PCA cannot produce more components than min(n_samples, n_features);
        # asking for more raises inside scikit-learn.
        n_components = min(max_components, n_samples, n_features)
        reducer = PCA(n_components=n_components, random_state=42)
        data_scaled = StandardScaler().fit_transform(data)
        reducer.fit(data_scaled)
    elif method == 'svd':
        # Keep the request below the number of features (at least 1).
        n_components = max(1, min(max_components, n_features - 1))
        reducer = TruncatedSVD(n_components=n_components, random_state=42)
        reducer.fit(data)
    else:
        raise ValueError(f"Unknown method {method!r}; expected 'pca' or 'svd'")

    cumsum = np.cumsum(reducer.explained_variance_ratio_)

    # Find number of components for desired variance
    reached = cumsum >= variance_threshold
    if reached.any():
        optimal_components = int(np.argmax(reached)) + 1
    else:
        # argmax over an all-False array is 0, which would wrongly report a
        # single component; fall back to everything that was fitted.
        optimal_components = len(cumsum)

    plt.figure(figsize=(10, 6))
    plt.plot(range(1, len(cumsum) + 1), cumsum, 'b-', linewidth=2)
    plt.axhline(y=variance_threshold, color='r', linestyle='--',
                label=f'{variance_threshold:.0%} variance threshold')
    plt.axvline(x=optimal_components, color='g', linestyle='--',
                label=f'Optimal components: {optimal_components}')
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.title(f'Optimal Components Selection - {method.upper()}')
    plt.legend()
    plt.grid(True)
    plt.show()

    print(f"Optimal number of components: {optimal_components}")
    print(f"Variance explained: {cumsum[optimal_components-1]:.3f}")

    return optimal_components

# Find optimal components for different feature sets
optimal_dense = find_optimal_components(dense_features_combined, method='pca')
# The sparse TF-IDF matrix is passed as-is: the 'svd' branch hands it straight
# to TruncatedSVD, so converting it to a dense array only costs memory.
optimal_tfidf = find_optimal_components(X_word_tfidf, method='svd', max_components=200)
```

---

## 6. 🤖 Advanced Model Selection & Training

Choose and train appropriate models for your NLP task.

### 📊 6.1 Traditional Machine Learning Models

```python
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, df['label'], test_size=0.2, random_state=42, stratify=df['label']
)

# Define models
# Display name -> unfitted estimator. Estimators that take a random_state
# here use 42.
# NOTE(review): MultinomialNB presumably needs non-negative features;
# confirm that X_combined contains no scaled/PCA columns with negative values.
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='rbf', random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# Train models
# Each estimator is fitted on the training split and stored under the same
# display name; trained_models holds the very objects defined in `models`.
trained_models = {}
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    trained_models[name] = model
```

---

### 🧠 6.2 Deep Learning Models

```python
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Prepare data for deep learning
tokenizer = Tokenizer(num_words=10000, oov_token='<UNK>')
tokenizer.fit_on_texts(df['clean_text'])

X_sequences = tokenizer.texts_to_sequences(df['clean_text'])
X_padded = pad_sequences(X_sequences, maxlen=100)

# Split data
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(
    X_padded, pd.get_dummies(df['label']), test_size=0.2, random_state=42
)

# LSTM Model
def create_lstm_model(vocab_size, embedding_dim=100, max_length=100, num_classes=2):
    """Build and compile a bidirectional LSTM text classifier.

    Args:
        vocab_size: Size of the embedding vocabulary.
        embedding_dim: Width of the embedding vectors.
        max_length: Length of the padded input sequences.
        num_classes: Number of softmax outputs.

    Returns:
        The compiled Keras model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    network = Sequential()
    network.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    network.add(Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.3)))
    network.add(Dense(32, activation='relu'))
    network.add(Dropout(0.5))
    network.add(Dense(num_classes, activation='softmax'))

    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

lstm_model = create_lstm_model(10000, num_classes=len(df['label'].unique()))

# Train LSTM
history = lstm_model.fit(
    X_train_dl, y_train_dl,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)
```

---

### 🏆 6.3 Ensemble Methods

```python
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

# Create ensemble
ensemble = VotingClassifier([
    ('lr', LogisticRegression(random_state=42, max_iter=1000)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42))
], voting='soft')

# Train ensemble
ensemble.fit(X_train, y_train)

# Cross-validation
cv_scores = cross_val_score(ensemble, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
```

---

## 7. 📊 Comprehensive Model Evaluation

Evaluate model performance using multiple metrics and techniques.

### 📈 7.1 Basic Metrics

```python
from sklearn.metrics import (classification_report, confusion_matrix, 
                           accuracy_score, precision_recall_fscore_support,
                           roc_auc_score, roc_curve)
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate_model(model, X_test, y_test, model_name):
    """Score a fitted classifier on held-out data and print a short report.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Held-out feature matrix.
        y_test: True labels for ``X_test``.
        model_name: Display name used in the printed header and the result.

    Returns:
        dict with keys ``model``, ``accuracy``, ``precision``, ``recall`` and
        ``f1``; precision/recall/F1 are the weighted averages across classes.
    """
    predicted = model.predict(X_test)

    accuracy = accuracy_score(y_test, predicted)
    # The fourth element (per-class support) is not needed for the summary.
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_test, predicted, average='weighted'
    )

    headline = (
        f"\n=== {model_name} Performance ===",
        f"Accuracy: {accuracy:.4f}",
        f"Precision: {precision:.4f}",
        f"Recall: {recall:.4f}",
        f"F1-Score: {f1:.4f}",
    )
    for line in headline:
        print(line)

    # Per-class breakdown.
    print("\nClassification Report:")
    print(classification_report(y_test, predicted))

    return dict(
        model=model_name,
        accuracy=accuracy,
        precision=precision,
        recall=recall,
        f1=f1,
    )

# Evaluate all models: one metrics dict per entry of trained_models
# (a name -> fitted estimator mapping defined earlier in the guide).
results = []
for name, model in trained_models.items():
    result = evaluate_model(model, X_test, y_test, name)
    results.append(result)

# Results comparison: one row per model, best weighted F1 first.
# results_df is reused later to pick the best model by 'f1'.
results_df = pd.DataFrame(results)
print("\n=== Model Comparison ===")
print(results_df.sort_values('f1', ascending=False))
```

---

### 🎯 7.2 Confusion Matrix Visualization

```python
def plot_confusion_matrix(model, X_test, y_test, model_name):
    """Plot a labelled confusion-matrix heatmap for a fitted classifier.

    Args:
        model: Fitted estimator exposing ``predict`` and ``classes_``.
        X_test: Held-out feature matrix.
        y_test: True labels for ``X_test``.
        model_name: Name shown in the plot title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    y_pred = model.predict(X_test)

    # Pin the row/column order to model.classes_. Without `labels`, the matrix
    # is built from the labels that happen to occur in y_test/y_pred, which can
    # differ in number or order from the tick labels used below.
    class_labels = model.classes_
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels)
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

# Plot confusion matrices for the first three entries of trained_models.
# NOTE(review): the slice follows dict insertion order, not the F1 ranking in
# results_df, so these are not necessarily the best-performing models.
for name, model in list(trained_models.items())[:3]:
    plot_confusion_matrix(model, X_test, y_test, name)
```

---

### 📊 7.3 ROC Curve Analysis

```python
from sklearn.preprocessing import label_binarize
from itertools import cycle

def plot_roc_curves(models, X_test, y_test):
    """Overlay ROC curves for every model that can produce probabilities.

    Args:
        models: Mapping of display name -> fitted classifier.
        X_test: Held-out feature matrix.
        y_test: True labels for ``X_test``.

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Only binary problems are drawn: models without ``predict_proba`` and
    label sets with a number of classes other than two are skipped silently.
    """
    plt.figure(figsize=(12, 8))
    palette = cycle(['blue', 'red', 'green', 'orange', 'purple'])

    for (name, clf), shade in zip(models.items(), palette):
        if not hasattr(clf, "predict_proba"):
            continue
        scores = clf.predict_proba(X_test)

        # For binary classification
        if len(np.unique(y_test)) != 2:
            continue

        # Column 1 holds the probability of clf.classes_[1], used as the
        # positive class for the curve.
        positive_scores = scores[:, 1]
        fpr, tpr, _ = roc_curve(y_test, positive_scores,
                                pos_label=clf.classes_[1])
        auc_score = roc_auc_score(y_test, positive_scores)
        plt.plot(fpr, tpr, color=shade, linewidth=2,
                 label=f'{name} (AUC = {auc_score:.3f})')

    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], 'k--', linewidth=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves Comparison')
    plt.legend(loc="lower right")
    plt.show()

# Draws one curve per model in trained_models that exposes predict_proba;
# if y_test does not contain exactly two classes, only the diagonal is drawn.
plot_roc_curves(trained_models, X_test, y_test)
```

---

### 🔍 7.4 Feature Importance Analysis

```python
def analyze_feature_importance(model, feature_names, top_n=20):
    """Plot the ``top_n`` most important features of a fitted model.

    Importance is read from ``feature_importances_`` when the model has it,
    otherwise from the absolute value of ``coef_``. For multi-class linear
    models the absolute coefficients are averaged over classes, so every class
    contributes (for a single-row ``coef_`` this equals ``abs(coef_[0])``).

    Args:
        model: Fitted estimator with ``feature_importances_`` or ``coef_``.
        feature_names: Sequence of names indexed like the model's features.
        top_n: Maximum number of features to show.

    Returns:
        None. Prints a message and returns early when the model exposes
        neither attribute; otherwise displays a horizontal bar chart.
    """
    if hasattr(model, 'feature_importances_'):
        importances = np.asarray(model.feature_importances_)
    elif hasattr(model, 'coef_'):
        coef = model.coef_
        if hasattr(coef, 'toarray'):
            # Sparse coefficient matrices are densified before aggregation.
            coef = coef.toarray()
        # atleast_2d also accepts a 1-D coef_; mean over the class axis.
        importances = np.abs(np.atleast_2d(coef)).mean(axis=0)
    else:
        print("Model doesn't support feature importance analysis")
        return

    # Get top features (ascending, so the largest bar ends up on top).
    # An explicit start index avoids the `[-0:]` pitfall, where top_n=0 would
    # select every feature instead of none.
    start = max(len(importances) - top_n, 0)
    indices = np.argsort(importances)[start:]

    plt.figure(figsize=(12, 8))
    plt.barh(range(len(indices)), importances[indices])
    plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
    plt.xlabel('Feature Importance')
    # Report the number actually shown, which can be below top_n.
    plt.title(f'Top {len(indices)} Important Features')
    plt.tight_layout()
    plt.show()

# Analyze feature importance for Random Forest
if 'Random Forest' in trained_models:
    # Get feature names (this is a simplified example)
    # NOTE(review): names come from word_tfidf only. This assumes the model was
    # trained on exactly those columns, in that order -- confirm, since other
    # parts of the guide combine or reduce features before training.
    feature_names = word_tfidf.get_feature_names_out().tolist()
    analyze_feature_importance(trained_models['Random Forest'], feature_names)
```

---

### 📉 7.5 Learning Curves

```python
from sklearn.model_selection import learning_curve

def plot_learning_curves(model, X, y, model_name):
    """Plot training vs. validation accuracy as the training set grows.

    Args:
        model: Unfitted or fitted estimator; ``learning_curve`` refits it.
        X: Feature matrix.
        y: Target labels.
        model_name: Name shown in the plot title.

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Uses 5-fold cross-validation at ten training-set fractions from 10% to
    100%; the shaded bands are one standard deviation across folds.
    """
    fractions = np.linspace(0.1, 1.0, 10)
    train_sizes, train_scores, val_scores = learning_curve(
        model, X, y, cv=5, n_jobs=-1,
        train_sizes=fractions,
        scoring='accuracy'
    )

    plt.figure(figsize=(10, 6))

    # Same drawing recipe for both curves: mean line plus a +/- 1 std band.
    curves = (
        (train_scores, 'blue', 'Training Score'),
        (val_scores, 'red', 'Validation Score'),
    )
    for fold_scores, shade, caption in curves:
        center = np.mean(fold_scores, axis=1)
        spread = np.std(fold_scores, axis=1)
        plt.plot(train_sizes, center, 'o-', color=shade, label=caption)
        plt.fill_between(train_sizes, center - spread, center + spread,
                         alpha=0.1, color=shade)

    plt.xlabel('Training Set Size')
    plt.ylabel('Accuracy Score')
    plt.title(f'Learning Curves - {model_name}')
    plt.legend(loc='best')
    plt.grid(True)
    plt.show()

# Plot learning curves for best model: the results_df row with the highest
# weighted F1 supplies the name used to look the estimator up in trained_models.
# best_model_name is reused by the deployment section further down.
best_model_name = results_df.loc[results_df['f1'].idxmax(), 'model']
plot_learning_curves(trained_models[best_model_name], X_train, y_train, best_model_name)
```

---

## 8. 🔁 Hyperparameter Tuning

Optimize model performance with grid search or randomized search.

```python
from sklearn.model_selection import GridSearchCV

# Candidate regularisation settings for LogisticRegression's `C` parameter.
params = {'C': [0.1, 1, 10]}
# Same estimator settings as the LogisticRegression used in the ensemble
# (random_state=42, max_iter=1000): the default max_iter=100 commonly stops
# before convergence on high-dimensional TF-IDF features.
grid = GridSearchCV(
    LogisticRegression(random_state=42, max_iter=1000),
    param_grid=params,
    cv=5,
)
# NOTE(review): this searches over X_tfidf / y. If those contain the rows later
# used as the test set, tune on the training split only -- confirm.
grid.fit(X_tfidf, y)
```

---

## 10. 🚀 Model Deployment

Deploy your trained model for production use.

### 💾 10.1 Model Serialization and Saving

```python
import pickle
import joblib
from datetime import datetime

# Save trained models and preprocessors
def save_model_pipeline(model, vectorizers, scalers, model_name, feature_shape=None):
    """
    Save complete model pipeline including preprocessors

    Writes ``model.pkl``, ``vectorizers.pkl``, ``scalers.pkl`` and
    ``metadata.json`` into a new ``models/<model_name>_<timestamp>`` directory.

    Args:
        model: Fitted estimator to persist.
        vectorizers: Object (e.g. dict) of fitted feature transformers.
        scalers: Object (e.g. dict) of fitted scalers.
        model_name: Prefix for the output directory name.
        feature_shape: Shape of the training feature matrix to record in the
            metadata. When omitted, the module-level ``X_optimized.shape`` is
            used, as in the original version of this function.

    Returns:
        str: Path of the directory the artifacts were written to.
    """
    # Imported locally so the snippet runs even when the surrounding notebook
    # has not imported these standard-library modules.
    import json
    import os

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_dir = f"models/{model_name}_{timestamp}"
    os.makedirs(model_dir, exist_ok=True)

    # Save main model
    joblib.dump(model, f"{model_dir}/model.pkl")

    # Save preprocessors
    joblib.dump(vectorizers, f"{model_dir}/vectorizers.pkl")
    joblib.dump(scalers, f"{model_dir}/scalers.pkl")

    if feature_shape is None:
        # Backward-compatible fallback to the notebook-level feature matrix.
        feature_shape = X_optimized.shape

    # Save metadata
    metadata = {
        'model_type': type(model).__name__,
        'timestamp': timestamp,
        # Plain ints so the shape serialises regardless of its numeric type.
        'feature_shape': [int(dim) for dim in feature_shape],
        'classes': model.classes_.tolist() if hasattr(model, 'classes_') else None
    }

    with open(f"{model_dir}/metadata.json", 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)

    print(f"Model saved to: {model_dir}")
    return model_dir

# Save best performing model (best_model_name was chosen by weighted F1 above).
# NOTE(review): the 'tfidf' entry stores svd_final, yet the API section calls
# VECTORIZERS['tfidf'].transform([clean_text]) on raw text and then .toarray().
# That looks like it expects the fitted TF-IDF vectorizer itself -- confirm
# which objects must be persisted for inference.
best_model = trained_models[best_model_name]
model_path = save_model_pipeline(
    best_model, 
    {'tfidf': svd_final, 'dense': pca_final},
    {'scaler': scaler_model},
    best_model_name.lower().replace(' ', '_')
)
```

---

### 🌐 10.2 REST API Deployment with Flask

```python
from flask import Flask, request, jsonify
import numpy as np

app = Flask(__name__)

# Load model and preprocessors
def load_model_pipeline(model_path):
    """Load the artifacts written by ``save_model_pipeline``.

    Args:
        model_path: Directory containing ``model.pkl``, ``vectorizers.pkl``,
            ``scalers.pkl`` and ``metadata.json``.

    Returns:
        tuple: ``(model, vectorizers, scalers, metadata)``.
    """
    # NOTE: joblib.load unpickles arbitrary objects -- only load directories
    # that come from a trusted source.
    model, vectorizers, scalers = (
        joblib.load(f"{model_path}/{stem}.pkl")
        for stem in ("model", "vectorizers", "scalers")
    )

    with open(f"{model_path}/metadata.json", 'r') as meta_file:
        metadata = json.load(meta_file)

    return model, vectorizers, scalers, metadata

# Global model variables: loaded once at import time and shared by all requests.
# NOTE(review): model_path is the variable assigned in the saving example; a
# standalone server process needs its own value (e.g. from configuration).
MODEL, VECTORIZERS, SCALERS, METADATA = load_model_pipeline(model_path)

def preprocess_text_for_api(text):
    """
    Preprocess single text for API prediction

    Args:
        text: Raw input string.

    Returns:
        Feature array for one sample: the vectorized text columns followed by
        the scaled, PCA-reduced dense features.
    """
    # Apply same preprocessing steps
    cleaned = preprocess_single_text(text)  # Your preprocessing function

    # Extract features (statistics use the raw text, the rest the cleaned text)
    stats = extract_statistical_features(text)
    sentiment = extract_sentiment_features(cleaned)
    # Computed as in the original example but not yet part of the dense vector.
    pos_feat = extract_pos_features(cleaned)

    # Vectorize
    text_vector = VECTORIZERS['tfidf'].transform([cleaned])

    # Scale dense features
    dense_row = [
        stats['char_count'], stats['word_count'],
        sentiment['textblob_polarity'], sentiment['vader_compound']
        # Add other features...
    ]
    dense_matrix = np.array([dense_row])
    scaled = SCALERS['scaler'].transform(dense_matrix)
    reduced = VECTORIZERS['dense'].transform(scaled)

    # Combine features
    return np.hstack([text_vector.toarray(), reduced])

@app.route('/predict', methods=['POST'])
def predict():
    """Classify the ``text`` field of a JSON request body.

    Returns:
        200 with ``prediction``, ``probabilities`` (empty when the model has no
        ``predict_proba``) and ``model_info``; 400 when the body is not a JSON
        object or carries no non-empty string ``text``; 500 with the error
        message when preprocessing or prediction fails.
    """
    # silent=True yields None for a missing, mistyped or malformed JSON body
    # instead of raising, so client mistakes are reported as 400 rather than 500.
    data = request.get_json(silent=True)
    text = data.get('text', '') if isinstance(data, dict) else ''

    if not isinstance(text, str) or not text.strip():
        return jsonify({'error': 'No text provided'}), 400

    try:
        # Preprocess and predict
        features = preprocess_text_for_api(text)
        prediction = MODEL.predict(features)[0]
        # numpy scalars (e.g. int64 labels) are not JSON serialisable;
        # .item() converts them to the equivalent built-in Python value.
        if hasattr(prediction, 'item'):
            prediction = prediction.item()

        # Get prediction probabilities if available
        if hasattr(MODEL, 'predict_proba'):
            probabilities = MODEL.predict_proba(features)[0]
            # JSON object keys must be strings; numpy label types are not
            # accepted as keys by the encoder.
            prob_dict = {
                str(class_name): float(prob)
                for class_name, prob in zip(MODEL.classes_, probabilities)
            }
        else:
            prob_dict = {}

        return jsonify({
            'prediction': prediction,
            'probabilities': prob_dict,
            'model_info': {
                'model_type': METADATA['model_type'],
                'timestamp': METADATA['timestamp']
            }
        })

    except Exception as e:
        # Top-level boundary: report any server-side failure as a 500.
        return jsonify({'error': str(e)}), 500

@app.route('/health', methods=['GET'])
def health():
    """Liveness probe: reports a fixed status plus whether MODEL is set."""
    model_ready = MODEL is not None
    payload = {'status': 'healthy', 'model_loaded': model_ready}
    return jsonify(payload)

# Script entry point: serve on all interfaces, port 5000, with debug disabled.
# NOTE(review): app.run starts Flask's built-in server; production setups
# normally run the app behind a dedicated WSGI server instead.
if __name__ == '__main__':
    app.run(debug=False, host='0.0.0.0', port=5000)
```

---

### ⚡ 10.3 FastAPI Deployment (High Performance)

```python
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import uvicorn

# Application object served by uvicorn (see the run command at the end).
app = FastAPI(title="NLP Model API", version="1.0.0")

# Load model (same as Flask)
# Relies on load_model_pipeline and model_path from the earlier examples;
# artifacts are loaded once at import time and shared by all requests.
MODEL, VECTORIZERS, SCALERS, METADATA = load_model_pipeline(model_path)

# Request body for POST /predict.
class TextRequest(BaseModel):
    # Text to classify (required).
    text: str
    # When False, the handler skips predict_proba and reports confidence 0.0.
    return_probabilities: bool = True

# Response body for POST /predict.
class PredictionResponse(BaseModel):
    # Predicted label, declared as a string.
    prediction: str
    # Class name -> probability; stays empty when probabilities are not computed.
    probabilities: dict = {}
    # Highest class probability, or 0.0 when probabilities are not computed.
    confidence: float
    # 'model_type' and 'timestamp' copied from the saved metadata.
    model_info: dict

@app.post("/predict", response_model=PredictionResponse)
async def predict(request: TextRequest):
    """Classify ``request.text`` and optionally return class probabilities.

    Raises:
        HTTPException: 400 when the text is empty or whitespace only; 500 with
            the error message when preprocessing or prediction fails.
    """
    # Validate outside the try block: previously the 400 raised here was caught
    # by the generic handler below and re-raised as a 500.
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Empty text provided")

    try:
        # Preprocess and predict
        features = preprocess_text_for_api(request.text)
        prediction = MODEL.predict(features)[0]

        probabilities = {}
        confidence = 0.0

        if hasattr(MODEL, 'predict_proba') and request.return_probabilities:
            probs = MODEL.predict_proba(features)[0]
            # String keys keep the mapping JSON-serialisable for any label type.
            probabilities = {
                str(class_name): float(prob)
                for class_name, prob in zip(MODEL.classes_, probs)
            }
            confidence = float(max(probs))

        return PredictionResponse(
            # The response schema declares a str; labels may be numpy scalars.
            prediction=str(prediction),
            probabilities=probabilities,
            confidence=confidence,
            model_info={
                'model_type': METADATA['model_type'],
                'timestamp': METADATA['timestamp']
            }
        )

    except HTTPException:
        # Preserve deliberate HTTP errors and their status codes.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

@app.get("/health")
async def health_check():
    # Liveness probe: fixed status plus whether the global MODEL is set.
    model_ready = MODEL is not None
    return {"status": "healthy", "model_loaded": model_ready}

# Run with: uvicorn main:app --host 0.0.0.0 --port 8000
```

---

### 🐳 10.4 Docker Containerization

```dockerfile
# Dockerfile
# Image for the FastAPI service; dependencies are installed before the
# application code is copied so that code-only changes reuse the cached layers.
FROM python:3.9-slim

WORKDIR /app

# Install system dependencies
# gcc is needed to build Python packages that ship C extensions without wheels.
# --no-install-recommends keeps optional packages out of the image, and the apt
# lists are removed in the same layer so they do not add to its size.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create models directory (mount point for saved model artifacts)
RUN mkdir -p models

# Expose port
EXPOSE 8000

# Run the application
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
```

```yaml
# docker-compose.yml
# Two services: the model API built from the local Dockerfile, and an nginx
# container in front of it.
# NOTE(review): recent Docker Compose releases treat the top-level `version`
# key as obsolete and ignore it.
version: '3.8'
services:
  nlp-api:
    build: .
    ports:
      - "8000:8000"
    volumes:
      # Saved model artifacts are shared from the host, not baked into the image.
      - ./models:/app/models
    environment:
      # NOTE(review): the API examples in this guide load from a `model_path`
      # variable; confirm the application actually reads MODEL_PATH.
      - MODEL_PATH=/app/models/best_model
    restart: unless-stopped
  
  nginx:
    image: nginx:alpine
    ports:
      - "80:80"
    volumes:
      # nginx.conf must exist next to this file; it is not shown in this guide.
      - ./nginx.conf:/etc/nginx/nginx.conf
    depends_on:
      - nlp-api
    restart: unless-stopped
```

---

### ☁️ 10.5 Cloud Deployment Options

```python
# AWS Lambda deployment
import json
import boto3
import base64

def lambda_handler(event, context):
    """
    AWS Lambda function for serverless deployment

    Args:
        event: Either an API Gateway proxy event (JSON payload in ``body``,
            optionally base64-encoded when ``isBase64Encoded`` is set) or the
            payload dict itself for direct invocations.
        context: Lambda context object (unused).

    Returns:
        dict: Proxy-style response with ``statusCode``, ``headers`` and a JSON
        ``body``. 200 on success, 400 for an unparseable body or missing
        ``text``, 500 when preprocessing or prediction fails.
    """
    def _response(status_code, payload):
        # Every response, including errors, carries the same headers so that
        # browser clients can read failures across origins as well.
        return {
            'statusCode': status_code,
            'headers': {
                'Content-Type': 'application/json',
                'Access-Control-Allow-Origin': '*'
            },
            'body': json.dumps(payload)
        }

    # Parse input
    try:
        if 'body' in event:
            raw_body = event['body'] or '{}'
            if event.get('isBase64Encoded'):
                raw_body = base64.b64decode(raw_body).decode('utf-8')
            # Test invocations may pass an already-decoded object as the body.
            if isinstance(raw_body, (str, bytes, bytearray)):
                body = json.loads(raw_body)
            else:
                body = raw_body
        else:
            body = event
    except (ValueError, TypeError) as e:
        # Covers malformed JSON, bad base64 and undecodable bytes.
        return _response(400, {
            'error': f'Invalid request body: {e}',
            'status': 'error'
        })

    text = body.get('text', '') if isinstance(body, dict) else ''
    if not isinstance(text, str) or not text.strip():
        return _response(400, {
            'error': 'No text provided',
            'status': 'error'
        })

    try:
        # Load model (use S3 or container)
        # model = load_model_from_s3()

        # Predict
        features = preprocess_text_for_api(text)
        prediction = MODEL.predict(features)[0]
        # numpy scalars are not JSON serialisable; convert to a built-in value.
        if hasattr(prediction, 'item'):
            prediction = prediction.item()

        return _response(200, {
            'prediction': prediction,
            'status': 'success'
        })

    except Exception as e:
        # Top-level boundary: report any server-side failure as a 500.
        return _response(500, {
            'error': str(e),
            'status': 'error'
        })

# Google Cloud Functions
def gcp_cloud_function(request):
    """
    Google Cloud Function deployment

    Args:
        request: Incoming HTTP request object exposing ``method`` and
            ``get_json()``.

    Returns:
        dict: ``{'prediction': ...}`` for a POST whose JSON body contains
        ``text``; ``{'error': 'Invalid request'}`` for anything else.
    """
    # Guard clauses: anything but a POST carrying a 'text' field is rejected.
    if request.method != 'POST':
        return {'error': 'Invalid request'}

    payload = request.get_json()
    if not payload or 'text' not in payload:
        return {'error': 'Invalid request'}

    # Process and predict
    features = preprocess_text_for_api(payload['text'])
    prediction = MODEL.predict(features)[0]
    return {'prediction': prediction}
```

---

## ✅ Summary

This pipeline includes:

1. **Cleaning** 🧼
2. **Preprocessing** 🧪
3. **Visualization** ☁️
4. **Feature Engineering** ⚙️
5. **Model Training** 🤖
6. **Evaluation** 📊
7. **Tuning** 🔁
8. **Deployment** 🚀

Together, these stages form a full cycle for building robust and scalable NLP solutions.

In [4]:
import pandas as pd
import numpy as np
df = pd.read_csv("IMDB Dataset.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [5]:
df['review'][0]


"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [7]:
df.duplicated().sum()

np.int64(418)

In [8]:
# Drop the fully duplicated rows counted in the previous cell (418).
# The surviving rows keep their original index labels, so the index has gaps.
df = df.drop_duplicates()

In [9]:
# Cleaning

# lower case
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [11]:
# Lower-case every review so later text processing is case-insensitive.
df.loc[:, 'review'] = df['review'].str.lower()


In [12]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


In [13]:
# Remove leading and trailing whitespace from every review.
df.loc[:, 'review'] = df['review'].str.strip()


In [14]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


In [15]:
# Strip HTML tags such as "<br />": the non-greedy pattern removes each
# "<...>" span separately instead of everything between the first "<" and the
# last ">" of a review.
df.loc[:, 'review'] = df['review'].str.replace(r'<.*?>', '', regex=True)


In [16]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


In [17]:
# Remove URLs: anything starting with http://, https:// or www. up to the
# next whitespace character.
df.loc[:, 'review'] = df['review'].str.replace(r"https?://\S+|www\.\S+", '', regex=True)


In [18]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


In [20]:
# Sanity check after URL removal: select reviews that still match the same URL
# pattern. The recorded output below shows none remain.
url_reviews = df[df['review'].str.contains(r"https?://\S+|www\.\S+", regex=True)]

# Show the fifth match when there are at least five; otherwise report how many
# matches exist and print them all.
if len(url_reviews) > 4:
    print(url_reviews.iloc[4].values)
else:
    print(f"Only {len(url_reviews)} review(s) contain URLs. Here's what we found:")
    print(url_reviews.values)


Only 0 review(s) contain URLs. Here's what we found:
[]


In [23]:
# Expand contractions (the output below shows "there's" -> "there is").
# Null entries are passed through unchanged; everything else is cast to str.
df.loc[:, 'review'] = df['review'].apply(lambda x: contractions.fix(str(x)) if pd.notnull(x) else x)


In [24]:
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there is a family where a little boy...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i am going to have to disagree with the previo...
49999    no one expects the star trek movies to be high...
Name: review, Length: 49582, dtype: object

In [31]:
%pip install pyspellchecker

from spellchecker import SpellChecker
import pandas as pd

# Create an instance of SpellChecker
spell = SpellChecker()

# Optimized function for spelling correction
def spelling_correction(texts):
    # Vectorized approach: split texts into words, correct, and rejoin
    corrected = [' '.join(spell.correction(word) or word for word in text.split()) for text in texts]
    return pd.Series(corrected)

# Apply spelling correction to the whole 'review' column.
# NOTE(review): this is a per-word Python loop over every review, not a
# vectorized operation; the recorded run below ends in a KeyboardInterrupt.
# NOTE(review): column assignment aligns on index labels, and df's index has
# gaps after drop_duplicates, so the returned Series must carry the same index
# as df['review'] for rows to line up.
df['review'] = spelling_correction(df['review'])
df['review'].head()

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.3-py3-none-any.whl.metadata (9.5 kB)
Downloading pyspellchecker-0.8.3-py3-none-any.whl (7.2 MB)
   ---------------------------------------- 0.0/7.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/7.2 MB ? eta -:--:--
   -- ------------------------------------- 0.5/7.2 MB 1.5 MB/s eta 0:00:05
   ---- ----------------------------------- 0.8/7.2 MB 1.8 MB/s eta 0:00:04
   ------- -------------------------------- 1.3/7.2 MB 2.0 MB/s eta 0:00:04
   ----------- ---------------------------- 2.1/7.2 MB 2.3 MB/s eta 0:00:03
   -------------- ------------------------- 2.6/7.2 MB 2.5 MB/s eta 0:00:02
   ------------------ --------------------- 3.4/7.2 MB 2.7 MB/s eta 0:00:02
   ----------------------- ---------------- 4.2/7.2 MB 2.8 MB/s eta 0:00:02
   -------------------------- ------------- 4.7/7.2 MB 2.8 MB/s eta 0:00:01
   ------------------------------- -------- 5.8/7.2 MB 3.0 MB/s eta 0:00:01
   --------------------

KeyboardInterrupt: 