In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from ast import literal_eval
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [5]:
df = pd.read_csv('C:\\Users\\LENOVO\\Downloads\\codrelate\\AI-Powered Content Analysis and Recommendation.csv')
df.head()

Unnamed: 0,title,text,url,authors,timestamp,tags
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"['Mental Health', 'Health', 'Psychology', 'Sci..."
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"['Mental Health', 'Coronavirus', 'Science', 'P..."
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,"['Biotechnology', 'Neuroscience', 'Brain', 'We..."
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"['Health', 'Neuroscience', 'Mental Health', 'P..."
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"['Brain', 'Health', 'Development', 'Psychology..."


In [6]:
categorical_columns = df.select_dtypes(include=['object']).columns
numerical_columns = df.select_dtypes(include=['number']).columns

print("Categorical Columns: ", categorical_columns)
print("Numerical Columns: ", numerical_columns)

Categorical Columns:  Index(['title', 'text', 'url', 'authors', 'timestamp', 'tags'], dtype='object')
Numerical Columns:  Index([], dtype='object')


In [7]:
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
df['year_month'] = df['timestamp'].dt.to_period('M')
df

  df['year_month'] = df['timestamp'].dt.to_period('M')


Unnamed: 0,title,text,url,authors,timestamp,tags,year_month
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"['Mental Health', 'Health', 'Psychology', 'Sci...",2020-12
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"['Mental Health', 'Coronavirus', 'Science', 'P...",2020-09
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,"['Biotechnology', 'Neuroscience', 'Brain', 'We...",2020-10
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"['Health', 'Neuroscience', 'Mental Health', 'P...",2020-12
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"['Brain', 'Health', 'Development', 'Psychology...",2020-02
...,...,...,...,...,...,...,...
192363,Why do you need a cleaning service?,What could be more important than having a tid...,https://medium.com/@ozneedcleaningau/why-do-yo...,[],2021-11-16 08:17:08.950000+00:00,"['Cleaning', 'Cleaning Services', 'Cleaning Co...",2021-11
192364,Daily cleaning and maintenance of bedding,Daily cleaning and maintenance of bedding\n\nW...,https://medium.com/@a198blwt/daily-cleaning-an...,[],2021-11-16 05:27:05.359000+00:00,"['Bedding', 'Cleaning', 'Maintain']",2021-11
192365,Beneficial Advice on Bond Cleaning!,The most important chore at the end is bond cl...,https://medium.com/@princegohil/beneficial-adv...,['Prince Shrawan'],2021-11-26 08:20:27.660000+00:00,"['Cleaning', 'End Of Lease Cleaning', 'Cleaners']",2021-11
192366,How I Learned Romanian in 37 Easy Steps,How I Learned Romanian in 37 Easy Steps\n\nHey...,https://medium.com/@lifeinromania/how-i-learne...,['Sam Ursu'],2017-11-27 08:09:19.025000+00:00,"['Romania', 'Language Learning', 'Storyofmylife']",2017-11


In [8]:
df.isnull().sum()

title           5
text            0
url             0
authors         0
timestamp     413
tags            0
year_month    413
dtype: int64

In [9]:
df[numerical_columns] = df[numerical_columns].interpolate()

In [10]:
df['title'].fillna("Unknown", inplace=True)

In [11]:
df.isnull().sum()

title           0
text            0
url             0
authors         0
timestamp     413
tags            0
year_month    413
dtype: int64

In [12]:
import ast

def safe_parse_tags(x):
    if pd.isnull(x): return []
    if isinstance(x, list): return x
    try:
        return ast.literal_eval(x)
    except:
        return []

df['tags'] = df['tags'].apply(safe_parse_tags)

In [13]:
from collections import Counter

# Flatten all tags
flat_tags = [tag for sublist in df['tags'] for tag in sublist]
tag_counts = Counter(flat_tags)

# Keep top N tags
top_tags = set([tag for tag, count in tag_counts.most_common(100)])  # You can adjust 100

# Filter tags
df['tags'] = df['tags'].apply(lambda tag_list: [tag for tag in tag_list if tag in top_tags])

In [14]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
tags_encoded = mlb.fit_transform(df['tags'])
encoded_tag_df = pd.DataFrame(tags_encoded, columns=mlb.classes_)

# Step 3: Drop the original 'tags' column and concat encoded columns
df.drop('tags', axis=1, inplace=True)
df = pd.concat([df.reset_index(drop=True), encoded_tag_df.reset_index(drop=True)], axis=1)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fill missing titles
df['title'] = df['title'].fillna("Unknown")

# Initialize vectorizer (adjust max_features as needed)
tfidf = TfidfVectorizer(max_features=1000, stop_words='english')

# Transform titles into TF-IDF vectors
title_tfidf = tfidf.fit_transform(df['title'])

# Convert to DataFrame with same index as df
title_df = pd.DataFrame(title_tfidf.toarray(), 
                        columns=[f"title_tfidf_{col}" for col in tfidf.get_feature_names_out()],
                        index=df.index)

# Drop original title and add tfidf features directly into df
df.drop('title', axis=1, inplace=True)
df[title_df.columns] = title_df


  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.columns] = title_df
  df[title_df.

In [16]:
df['word_count'] = df['text'].apply(lambda x: len(str(x).split()))

# Compute estimated reading time (in minutes)
df['reading_time'] = df['word_count'] / 200

# Optional: round to 2 decimal places
df['reading_time'] = df['reading_time'].round(2)

  df['word_count'] = df['text'].apply(lambda x: len(str(x).split()))
  df['reading_time'] = df['word_count'] / 200


In [17]:
df.drop(['text', 'url', 'authors', 'timestamp', 'year_month'], axis=1, inplace=True, errors='ignore')

In [18]:
df

Unnamed: 0,AI,Advice,Art,Artificial Intelligence,Baby,Bitcoin,Blockchain,Books,Business,Careers,...,title_tfidf_wrong,title_tfidf_year,title_tfidf_years,title_tfidf_yes,title_tfidf_york,title_tfidf_young,title_tfidf_youtube,title_tfidf_zero,word_count,reading_time
0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,862,4.31
1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1094,5.47
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,910,4.55
3,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19,0.10
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,383,1.92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192363,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,454,2.27
192364,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,465,2.33
192365,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,333,1.66
192366,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1965,9.82


In [19]:
df.columns

Index(['AI', 'Advice', 'Art', 'Artificial Intelligence', 'Baby', 'Bitcoin',
       'Blockchain', 'Books', 'Business', 'Careers',
       ...
       'title_tfidf_wrong', 'title_tfidf_year', 'title_tfidf_years',
       'title_tfidf_yes', 'title_tfidf_york', 'title_tfidf_young',
       'title_tfidf_youtube', 'title_tfidf_zero', 'word_count',
       'reading_time'],
      dtype='object', length=1102)

In [20]:
df_copy = df.copy()
df_copy.to_csv("model_dataset.csv", index=False)

In [None]:
# Convert timestamp to datetime and extract year_month (optional, dropped later)
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')