In [2]:
!pip install wordcloud

Collecting wordcloud
  Downloading wordcloud-1.9.4-cp313-cp313-win_amd64.whl.metadata (3.5 kB)
Downloading wordcloud-1.9.4-cp313-cp313-win_amd64.whl (300 kB)
Installing collected packages: wordcloud
Successfully installed wordcloud-1.9.4


In [5]:
# ============================================================
# üìò NOTEBOOK 1: Data Preprocessing
# ============================================================
# Project: Text, Social Media & Web Analytics Capstone
# Author: Abhishek Gantana
# Purpose: Load raw text datasets, clean & preprocess text,
#          save cleaned versions, and visualize word patterns.
# ============================================================

# ---------- 1. Library Imports ----------
import pandas as pd
import numpy as np
import os
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')  # ‚Üê This one is the new requirement for newer NLTK versions


# Download NLTK dependencies (only first run)
nltk.download('stopwords')
nltk.download('punkt')

# ---------- 2. Folder Setup ----------
DATA_DIR = "data"
OUTPUT_DIR = "outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"üìÅ Data Directory: {DATA_DIR}")
print(f"üìÇ Outputs Directory: {OUTPUT_DIR}")

# ---------- 3. Helper Functions ----------
def clean_text(text):
    """Clean text by removing URLs, mentions, emojis, stopwords, and punctuation."""
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # remove URLs
    text = re.sub(r'@\w+|#', '', text)  # remove mentions and hashtags
    text = re.sub(r'[^A-Za-z\s]', '', text)  # keep only letters
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    return " ".join(tokens)

def generate_wordcloud(text_series, title, save_name):
    """Generate and save a word cloud image."""
    text_combined = " ".join(text_series.dropna())
    wc = WordCloud(width=900, height=500, background_color="white", colormap="plasma").generate(text_combined)
    plt.figure(figsize=(10, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.title(title, fontsize=14)
    plt.tight_layout()
    path = f"{OUTPUT_DIR}/{save_name}.png"
    plt.savefig(path, bbox_inches="tight")
    plt.close()
    print(f"‚úÖ WordCloud saved: {path}")

def top_word_frequency(text_series, top_n=10, dataset_name="Dataset"):
    """Display top frequent words."""
    all_words = " ".join(text_series).split()
    freq = Counter(all_words)
    top_words = dict(freq.most_common(top_n))
    plt.figure(figsize=(8, 4))
    plt.bar(top_words.keys(), top_words.values(), color="teal")
    plt.title(f"üî§ Top {top_n} Frequent Words ‚Äî {dataset_name}")
    plt.xticks(rotation=45)
    plt.tight_layout()
    path = f"{OUTPUT_DIR}/{dataset_name.lower().replace(' ', '_')}_top_words.png"
    plt.savefig(path, bbox_inches="tight")
    plt.close()
    print(f"‚úÖ Frequency plot saved: {path}")

# ---------- 4. Load Datasets ----------
files = {
    "Tweets": f"{DATA_DIR}/tweets_sample.csv",
    "Reviews": f"{DATA_DIR}/reviews_data.csv",
    "News": f"{DATA_DIR}/news_headlines.csv"
}

datasets = {}
for name, path in files.items():
    if os.path.exists(path):
        datasets[name] = pd.read_csv(path)
        print(f"‚úÖ Loaded {name}: {datasets[name].shape[0]} rows, {datasets[name].shape[1]} cols")
    else:
        print(f"‚ö†Ô∏è File not found: {path}")

# ---------- 5. Preview Data ----------
for name, df in datasets.items():
    print(f"\nüìä Sample from {name}:")
    print(df.head(3))

# ---------- 6. Text Cleaning (Updated & Safe) ----------
cleaned_datasets = {}

for name, df in datasets.items():
    # Try to detect text column automatically
    text_columns = df.select_dtypes(include='object').columns.tolist()
    
    if not text_columns:
        print(f"‚ö†Ô∏è No text column found in {name}, skipping...")
        continue

    # Prefer text columns likely named 'text', 'tweet', 'review', or 'headline'
    candidates = [col for col in text_columns if any(k in col.lower() for k in ['text', 'tweet', 'review', 'headline', 'content'])]
    text_col = candidates[0] if candidates else text_columns[0]

    print(f"üß© Using column '{text_col}' for {name}")
    df["cleaned_text"] = df[text_col].astype(str).apply(clean_text)
    
    cleaned_datasets[name] = df
    print(f"üßπ Cleaned {name} ‚Äî stored in memory")


# ---------- 7. Save Cleaned Datasets ----------
for name, df in cleaned_datasets.items():
    file_name = f"{OUTPUT_DIR}/cleaned_{name.lower()}.csv"
    df.to_csv(file_name, index=False)
    print(f"üíæ Saved cleaned file: {file_name}")

# ---------- 8. Generate Word Clouds ----------
generate_wordcloud(cleaned_datasets["Tweets"]["cleaned_text"], "Tweets WordCloud", "tweets_wordcloud")
generate_wordcloud(cleaned_datasets["Reviews"]["cleaned_text"], "Reviews WordCloud", "reviews_wordcloud")
generate_wordcloud(cleaned_datasets["News"]["cleaned_text"], "News Headlines WordCloud", "news_wordcloud")

# ---------- 9. Word Frequency Charts ----------
top_word_frequency(cleaned_datasets["Tweets"]["cleaned_text"], 10, "Tweets")
top_word_frequency(cleaned_datasets["Reviews"]["cleaned_text"], 10, "Reviews")
top_word_frequency(cleaned_datasets["News"]["cleaned_text"], 10, "News Headlines")

# ---------- 10. Summary ----------
print("\n‚úÖ Data preprocessing complete!")
print("üìÇ Cleaned files and visuals stored in 'outputs/' folder.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Abhishekgantana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Abhishekgantana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Abhishekgantana\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Abhishekgantana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Abhishekgantana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


üìÅ Data Directory: data
üìÇ Outputs Directory: outputs
‚úÖ Loaded Tweets: 300 rows, 6 cols
‚úÖ Loaded Reviews: 200 rows, 6 cols
‚úÖ Loaded News: 250 rows, 5 cols

üìä Sample from Tweets:
   tweet_id username                                         tweet_text  \
0         1   user_1  Great initiative for sustainability and eco pa...   
1         2   user_2   Highly recommend this product ‚Äî excellent value!   
2         3   user_3  Love the new product launch! Great innovation ...   

   likes  retweets   timestamp  
0    553        80  2024-01-01  
1    728       220  2024-01-02  
2    239       246  2024-01-03  

üìä Sample from Reviews:
   review_id      product_name  \
0          1      Smartphone X   
1          2  Smartwatch Elite   
2          3  Smartwatch Elite   

                                       review_text  rating verified_purchase  \
0  The display and design are absolutely stunning.       1               Yes   
1   Sound quality is average, not worth the price.

  plt.tight_layout()
  plt.savefig(path, bbox_inches="tight")


‚úÖ Frequency plot saved: outputs/tweets_top_words.png
‚úÖ Frequency plot saved: outputs/reviews_top_words.png
‚úÖ Frequency plot saved: outputs/news_headlines_top_words.png

‚úÖ Data preprocessing complete!
üìÇ Cleaned files and visuals stored in 'outputs/' folder.
