# Text Mining & Text Analytics

### *Objective*:
To apply text mining techniques to perform document classification. You will train a machine learning model to distinguish between two types of posts from Reddit: those related to Data Science and those related to Game of Thrones. The goal is to explore how text mining can be used for categorizing documents and gain insights into real-world applications like spam filtering, sentiment analysis, and topic detection.

In [66]:
# Libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# %pip install -q wordcloud nltk seaborn matplotlib pandas numpy
import nltk
import random
import string
import re
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import BernoulliNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Download necessary NLTK resources (reported as already up-to-date when present)
nltk.download('stopwords')  # English stopword list read by preprocess_text
nltk.download('wordnet')  # corpus used by WordNetLemmatizer in preprocess_text
nltk.download('punkt')  # tokenizer data for nltk.word_tokenize
nltk.download('punkt_tab')  # presumably the punkt tables newer NLTK releases look up -- verify



[nltk_data] Downloading package stopwords to /home/roxel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/roxel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/roxel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/roxel/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

Since I couldn't find an existing dataset containing Reddit posts about both Data Science and Game of Thrones, I decided to make things a little more interesting and build my own dataset by combining two existing datasets of Reddit posts, one for each topic.

In [67]:
# %pip install kagglehub
import kagglehub

# Download the datasets from Kaggle
# path_a = kagglehub.dataset_download("nikhilkhetan/game-of-thrones")
# print("Path to GOT dataset:", path_a)

# path_b = kagglehub.dataset_download("maksymshkliarevskyi/reddit-data-science-posts")
# print("Path to DataSci dataset:", path_b)

In [68]:
# Read the two raw Reddit exports from the local data directory
path_a, path_b = '../data/GameofThrones.csv', '../data/reddit_database.csv'

got_df, ds_df = (pd.read_csv(csv_path) for csv_path in (path_a, path_b))

In [69]:
# Show each frame's columns to find the fields the two datasets share
for frame in (ds_df, got_df):
    print(frame.columns)


Index(['created_date', 'created_timestamp', 'subreddit', 'title', 'id',
       'author', 'author_created_utc', 'full_link', 'score', 'num_comments',
       'num_crossposts', 'subreddit_subscribers', 'post'],
      dtype='object')
Index(['title', 'score', 'id', 'url', 'comms_num', 'created', 'body',
       'timestamp'],
      dtype='object')


Matching Columns: {title, id, score, post:body}

In [70]:
# Merge title and body into one 'text' field per post; missing parts become ''.
# The body column is named 'post' in the Data Science data and 'body' in the GOT data.
for frame, body_col in ((ds_df, 'post'), (got_df, 'body')):
    frame['text'] = frame['title'].fillna('') + ' ' + frame[body_col].fillna('')


In [71]:
# Drop posts whose combined text is empty or whitespace-only.
# The filtered frame has to be bound back to ds_df: binding it to any other
# name leaves the blank Data Science posts in the frame that is sampled later.
ds_df = ds_df[ds_df['text'].str.strip() != '']
got_df = got_df[got_df['text'].str.strip() != '']


In [72]:
# Balance the classes: draw the same number of posts from each dataset,
# capped at 5000 (fewer when the smaller dataset has less than that).
n = min(5000, len(got_df), len(ds_df))
sample_kwargs = {'n': n, 'random_state': 42}
ds_sample = ds_df.sample(**sample_kwargs)[['text']].copy()
got_sample = got_df.sample(**sample_kwargs)[['text']].copy()

In [73]:
# Tag every sampled post with the class it was drawn from
for sample, category_name in (
    (ds_sample, 'data science'),
    (got_sample, 'game of thrones'),
):
    sample['category'] = category_name

In [74]:
# Stack both samples, then shuffle the rows reproducibly so the two classes
# are interleaved, and renumber the index from 0.
base = (
    pd.concat([ds_sample, got_sample], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [75]:
# Persist the merged, shuffled dataset (without the index column)
base.to_csv(path_or_buf='../data/reddit_posts.csv', index=False)

In [76]:
# Loading the dataset
# df = pd.read_csv('../data/reddit_posts.csv')
# not needed in my case right now, but keeping it for future reference


In [77]:
# analyzing the dataset
print("Dataset shape:", base.shape)
print("Dataset columns:", base.columns)
# DataFrame.info() writes its report to stdout itself and returns None, so
# call it directly rather than passing its return value to print().
print("Dataset info:")
base.info()
print(base.head(5))


Dataset shape: (3918, 2)
Dataset columns: Index(['text', 'category'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3918 entries, 0 to 3917
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      3918 non-null   object
 1   category  3918 non-null   object
dtypes: object(2)
memory usage: 61.3+ KB
Dataset info: None
                                                text         category
0  Seeking resources to have a deeper understandi...     data science
1  [Spoilers] Finally, someone says that Daenerys...  game of thrones
2  Modelling process for logistic regression I'm ...     data science
3         [Spoilers] The 3 Sides of Jamie Lannister   game of thrones
4  [NO SPOILERS] is it just me or are knights nev...  game of thrones


Since I created the dataset myself, I know that it doesn't contain any null values.

In [78]:
# Compiled once at definition time rather than on every call.
# Each range is listed explicitly. A single span from U+24C2 to U+1F251 would
# also swallow CJK ideographs, Hangul, private-use characters and many other
# non-emoji code points that lie between those two endpoints.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Symbols & pictographs
    "\U0001F680-\U0001F6FF"  # Transport & map symbols
    "\U0001F700-\U0001F77F"  # Alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric shapes extended
    "\U0001F800-\U0001F8FF"  # Supplemental arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental symbols & pictographs
    "\U0001FA00-\U0001FA6F"  # Chess symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and pictographs extended-A
    "\U00002600-\U000026FF"  # Miscellaneous symbols (sun, umbrella, ...)
    "\U00002702-\U000027B0"  # Dingbats
    "\U0001F1E6-\U0001F1FF"  # Regional indicators (flag sequences)
    "\U000024C2"  # Circled latin capital M
    "\U0001F200-\U0001F251"  # Enclosed ideographic supplement
    "\U0000FE0F"  # Variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """
    Remove emojis from the given text.

    Every run of characters matched by ``_EMOJI_PATTERN`` is deleted (replaced
    by the empty string, not by a space). Characters outside the listed emoji
    ranges, including non-Latin scripts, are left untouched.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` with the matched emoji characters removed.
    """
    return _EMOJI_PATTERN.sub('', text)


In [79]:
# Func to preprocess the data
# Lazily filled cache for the objects that are identical on every call.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the (stop word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """
    Perform preprocessing steps on a given text.

    Steps, in order: lowercase, strip URLs, strip emojis, drop every character
    that is not an ASCII letter or whitespace, tokenize with NLTK, remove
    English stopwords, lemmatize, and join the tokens with single spaces.

    Args:
        text: Raw post text. Any non-string value (for example the NaN pandas
            uses for a missing cell) is treated as an empty document.

    Returns:
        The cleaned text as one space-separated string; '' when nothing is left.
    """
    if not isinstance(text, str):
        return ''

    # The stop word set and lemmatizer are shared across calls: this function
    # runs once per row via Series.apply, and rebuilding them per row is pure
    # overhead.
    stop_words, lemmatizer = _preprocess_resources()

    # Lowercase the text
    text = text.lower()
    # Remove URLs ('http\S+' already covers https links)
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove emojis
    text = remove_emojis(text)
    # Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize, drop stopwords, lemmatize what remains
    tokens = nltk.word_tokenize(text)
    kept = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Rejoin tokens into a single string
    return ' '.join(kept)

In [80]:
# Work on a copy: a plain `clean_df = base` only aliases the same frame, so
# the raw text in `base` would be overwritten and re-running this cell would
# preprocess already-cleaned text a second time.
clean_df = base.copy()
clean_df['text'] = clean_df['text'].apply(preprocess_text)
print("cleaned dataset")
print("=================")
print(clean_df.head(10))

cleaned dataset
                                                text         category
0  seeking resource deeper understanding cnns int...     data science
1  spoiler finally someone say daeneryss arc nowhere  game of thrones
2  modelling process logistic regression im wonde...     data science
3                       spoiler side jamie lannister  game of thrones
4  spoiler knight never used properly game throne...  game of thrones
5            comment nothing memorable winter coming  game of thrones
6  hey everyone made gaze tracker thought id shar...     data science
7  comment submission automatically removed post ...  game of thrones
8              top highest paying technology company     data science
9  comment polarized ending internet youre going ...  game of thrones


I'm label-encoding the category column because it's binary. 

In [81]:
from sklearn.preprocessing import LabelEncoder

# Encode the two category names as integers. The resulting mapping is
# 'data science' -> 0 and 'game of thrones' -> 1.
le = LabelEncoder().fit(clean_df['category'])
clean_df['label'] = le.transform(clean_df['category'])


In [82]:
# Preview the first ten rows, now including the integer 'label' column
print(clean_df.head(n=10))

                                                text         category  label
0  seeking resource deeper understanding cnns int...     data science      0
1  spoiler finally someone say daeneryss arc nowhere  game of thrones      1
2  modelling process logistic regression im wonde...     data science      0
3                       spoiler side jamie lannister  game of thrones      1
4  spoiler knight never used properly game throne...  game of thrones      1
5            comment nothing memorable winter coming  game of thrones      1
6  hey everyone made gaze tracker thought id shar...     data science      0
7  comment submission automatically removed post ...  game of thrones      1
8              top highest paying technology company     data science      0
9  comment polarized ending internet youre going ...  game of thrones      1


In [83]:
# Feature extraction: learn the TF-IDF vocabulary and weights from the cleaned
# text, then turn every post into its TF-IDF feature row.
vectorizer = TfidfVectorizer()
vectorizer.fit(clean_df['text'])
X = vectorizer.transform(clean_df['text'])

In [84]:
import joblib
# Persist the fitted vectorizer so new text can be transformed the same way later
joblib.dump(value=vectorizer, filename='../data/tfidf_vectorizer.pkl')

['../data/tfidf_vectorizer.pkl']

In [None]:

# Labels (Data Science = 0, Game of Thrones = 1)
# Reuse the column produced by the LabelEncoder so the targets always match
# what `le` decodes at prediction time, instead of repeating the mapping by hand.
# No dropna() here: dropping rows from y alone would leave it with fewer rows
# than X (one row per post) and break the row-by-row alignment.
y = clean_df['label']

In [86]:

# Split the data into training and testing sets.
# The cleaned text is split first and TF-IDF is fitted on the training part
# only. Fitting the vectorizer on the whole corpus lets vocabulary and IDF
# statistics from the test posts leak into the training features, which makes
# the test scores optimistic.
train_text, test_text, y_train, y_test = train_test_split(
    clean_df['text'], y, test_size=0.2, random_state=42
)
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

# Overwrite the saved vectorizer so the pickle on disk has the same vocabulary
# as the features the models are trained on.
joblib.dump(vectorizer, '../data/tfidf_vectorizer.pkl')


In [87]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Candidate classifiers, keyed by the name printed next to their scores
models = dict(
    naive_bayes=MultinomialNB(),
    logistic_regression=LogisticRegression(max_iter=1200),
    svm=SVC(kernel='linear'),
)


In [88]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def train_and_select_best_model(models):
    """
    Fit every candidate model and return the one with the highest test F1.

    Reads the notebook-level globals ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` for the data and ``vectorizer`` for the return value. Each
    model is fitted in place and its metrics are printed.

    Args:
        models: Mapping of display name -> unfitted scikit-learn classifier.

    Returns:
        A tuple ``(best_model, vectorizer, results)`` where ``best_model`` is
        the fitted classifier with the highest F1 score (the earliest one wins
        a tie, ``None`` when ``models`` is empty), ``vectorizer`` is the global
        vectorizer passed through unchanged, and ``results`` is a list of
        ``(name, accuracy, precision, recall, f1)`` tuples in iteration order.
    """
    best_model = None
    # Start below any reachable score: with a starting value of 0 and a strict
    # '>' comparison, a run in which every model scores F1 == 0 would select
    # no model at all.
    best_f1 = float('-inf')
    results = []

    # Train and evaluate each model
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        results.append((name, acc, prec, rec, f1))

        print(f"\n📊 {name}")
        print(f"  Accuracy:  {acc:.4f}")
        print(f"  Precision: {prec:.4f}")
        print(f"  Recall:    {rec:.4f}")
        print(f"  F1 Score:  {f1:.4f}")

        # Update best model
        if f1 > best_f1:
            best_f1 = f1
            best_model = model

    print("\n✅ Best model based on F1-score:", type(best_model).__name__)
    return best_model, vectorizer, results


In [89]:
# Train all candidates and keep the winner, the vectorizer and the score table
best_model, vectorizer, model_results = train_and_select_best_model(models=models)


📊 naive_bayes
  Accuracy:  0.9796
  Precision: 0.9623
  Recall:    0.9974
  F1 Score:  0.9795

📊 logistic_regression
  Accuracy:  0.9974
  Precision: 0.9974
  Recall:    0.9974
  F1 Score:  0.9974

📊 svm
  Accuracy:  0.9987
  Precision: 0.9974
  Recall:    1.0000
  F1 Score:  0.9987

✅ Best model based on F1-score: SVC


In [91]:
# Persist the winning classifier
joblib.dump(value=best_model, filename='../data/best_model.pkl')
# Persist the label encoder used to map integer labels back to category names
joblib.dump(value=le, filename='../data/label_encoder.pkl')

['../data/label_encoder.pkl']

In [93]:
import pandas as pd

# Restore the persisted classifier, TF-IDF vectorizer and label encoder
def _load_artifact(path):
    """Open *path* in binary mode and load the joblib-serialized object in it."""
    with open(path, 'rb') as handle:
        return joblib.load(handle)


model = _load_artifact('../data/best_model.pkl')
vectorizer = _load_artifact('../data/tfidf_vectorizer.pkl')
label_encoder = _load_artifact('../data/label_encoder.pkl')

# Define your prediction function
def predict_category(texts):
    """
    Predict the category of one or more raw text strings.

    Each text is cleaned with ``preprocess_text``, vectorized with the loaded
    ``vectorizer``, classified with the loaded ``model`` and decoded back to a
    category name with ``label_encoder``.

    Args:
        texts: A single string or an iterable of strings.

    Returns:
        An array with one predicted category name per input text, in input
        order; an empty array when no texts are given.
    """
    if isinstance(texts, str):
        texts = [texts]  # convert single string to list

    # Apply the same cleaning that produced the training corpus. Vectorizing
    # raw text would feed the model unlemmatized words, stopwords and URLs
    # that it never saw during training.
    cleaned = [preprocess_text(text) for text in texts]
    if not cleaned:
        # Nothing to classify: return an empty array of the label dtype.
        return label_encoder.classes_[:0]

    # Vectorize the input texts
    X_new = vectorizer.transform(cleaned)

    # Predict label
    predictions = model.predict(X_new)

    # Decode label to original category name
    return label_encoder.inverse_transform(predictions)


In [None]:
# single prediction
single_post = "How to build a neural network with PyTorch?"
print(predict_category(single_post))

# batch predictions
test_posts = [
    "Spoiler alert: Arya kills the Night King!",
    "Best way to visualize time series data using Python?",
]
batch_result = predict_category(test_posts)

print(batch_result)


['data science']
['game of thrones' 'data science']
