In [None]:
import os
from ast import literal_eval
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
from scipy.stats import pearsonr
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from textstat import flesch_reading_ease  # For text complexity



In [None]:
# Location of the raw article dataset. The default is the path the notebook was
# originally written against; set the CONTENT_CSV_PATH environment variable to
# load the file from another location without editing this cell.
DATA_PATH = os.environ.get(
    'CONTENT_CSV_PATH',
    'C:\\Users\\LENOVO\\Downloads\\codrelate\\AI-Powered Content Analysis and Recommendation.csv',
)

df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Partition the column labels by dtype: object-typed columns are treated as
# categorical, numeric-typed columns as numerical.
categorical_columns = df.select_dtypes(include='object').columns
numerical_columns = df.select_dtypes(include='number').columns

print("Categorical Columns: ", categorical_columns)
print("Numerical Columns: ", numerical_columns)


In [None]:
# Report the frame's (rows, columns) shape, then the per-column summary
# produced by DataFrame.info().
print("Shape of data:", df.shape)
df.info()


In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# Summary statistics as produced by DataFrame.describe() with default arguments.
df.describe()

In [None]:
# Count the rows flagged by DataFrame.duplicated() (repeats of an earlier row).
df.duplicated().sum()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Print the dtype pandas assigned to each column.
print("Data types:\n", df.dtypes)

In [None]:
# Print describe() for every column; include='all' asks pandas to summarise
# non-numeric columns as well.
print("\nBasic Dataset Info:\n", df.describe(include='all'))

In [None]:
# Parse the timestamp column into datetimes. errors='coerce' turns values that
# cannot be parsed into NaT instead of raising.
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')


In [None]:
# Derive a monthly Period from each timestamp; used as the grouping key for
# the articles-over-time chart.
df['year_month'] = df['timestamp'].dt.to_period('M')


In [None]:
# Count the articles in each year_month bucket and draw the trend as a line
# chart with one marker per month.
monthly_articles = df.groupby('year_month').size()

fig, ax = plt.subplots(figsize=(12, 6))
monthly_articles.plot(ax=ax, kind='line', marker='o', color='royalblue')
ax.set_title('Articles Published Over Time')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Articles')
plt.xticks(rotation=45)
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Turn the stringified author lists into real Python lists.
# Only strings are passed to literal_eval: values that are already lists are
# kept as they are (so re-running this cell is harmless), and anything else
# (e.g. a missing value) becomes an empty list so later iteration cannot fail.
df['authors'] = df['authors'].apply(
    lambda value: literal_eval(value) if isinstance(value, str)
    else (value if isinstance(value, list) else [])
)


In [None]:
# Flatten the per-article author lists into one list, then keep the ten names
# that occur most often.
all_authors = []
for article_authors in df['authors']:
    all_authors.extend(article_authors)

top_authors = Counter(all_authors).most_common(10)



In [None]:
# Horizontal bar chart of the most frequent authors: counts on the x axis,
# author names on the y axis.
author_names, author_counts = zip(*top_authors)

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=list(author_counts), y=list(author_names), palette="viridis", ax=ax)
ax.set_title('Top 10 Authors by Article Count')
ax.set_xlabel('Number of Articles')
ax.set_ylabel('Author')
fig.tight_layout()
plt.show()


In [None]:
# Turn the stringified tag lists into real Python lists.
# Only strings are passed to literal_eval: values that are already lists are
# kept as they are (so re-running this cell is harmless), and anything else
# (e.g. a missing value) becomes an empty list so later iteration cannot fail.
df['tags'] = df['tags'].apply(
    lambda value: literal_eval(value) if isinstance(value, str)
    else (value if isinstance(value, list) else [])
)

In [None]:
# Flatten the per-article tag lists into one list, then keep the ten tags that
# occur most often.
all_tags = []
for article_tags in df['tags']:
    all_tags.extend(article_tags)

top_tags = Counter(all_tags).most_common(10)



In [None]:
# Horizontal bar chart of the most frequent tags: counts on the x axis, tag
# names on the y axis.
tag_names, tag_counts = zip(*top_tags)

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=list(tag_counts), y=list(tag_names), palette="magma", ax=ax)
ax.set_title('Top 10 Tags')
ax.set_xlabel('Number of Mentions')
ax.set_ylabel('Tag')
fig.tight_layout()
plt.show()

In [None]:
# Fill gaps in the numeric columns using DataFrame.interpolate() with its
# default settings. numerical_columns was captured right after loading, so it
# does not include columns derived later in the notebook.
df[numerical_columns] = df[numerical_columns].interpolate()

In [None]:
# Fill missing values in the categorical columns with each column's most
# frequent value.
# Columns whose cells hold containers (the parsed 'authors' / 'tags' lists) are
# skipped: their values are unhashable, so mode() cannot be computed for them.
hashable_columns = [
    column for column in categorical_columns
    if not df[column].map(lambda value: isinstance(value, (list, dict, set))).any()
]
column_modes = df[hashable_columns].mode()
if not column_modes.empty:
    # mode() returns a DataFrame (one row per tied mode); the first row holds
    # one mode per column, and fillna() matches it to columns by label.
    df[hashable_columns] = df[hashable_columns].fillna(column_modes.iloc[0])

In [None]:
# Replace the dtype-derived column list with the explicit set of columns that
# the encoder below is configured to one-hot encode.
categorical_columns = ['title', 'authors', 'tags']

In [None]:
# Build a ColumnTransformer that one-hot encodes the columns named in
# categorical_columns and forwards every other column unchanged.
# NOTE(review): 'authors' and 'tags' were run through literal_eval earlier, so
# their cells are presumably Python lists; OneHotEncoder works on scalar
# categories, so these columns likely need flattening (e.g. a multi-label
# encoding) before fitting — confirm.
preprocessor = ColumnTransformer(
    transformers=[
        # Apply OneHotEncoder to categorical columns
        ('cat', OneHotEncoder(), categorical_columns)
    ],
    remainder='passthrough'  # Keep other columns unchanged
)

In [None]:
# Fit the transformer on the full frame and keep the transformed output.
# NOTE(review): remainder='passthrough' also forwards the non-encoded columns
# (presumably free text, datetimes and periods) — confirm the combined output
# is what downstream code expects.
df_transformed = preprocessor.fit_transform(df)

In [None]:
# Small hand-written sample of articles used for the rest of the analysis.
# (The original variable name was a file path, which is not a valid Python
# identifier; the dict and the DataFrame built from it now live in one cell.)
simulated_articles = {
    'title': [
        "Mental Note Vol. 24", "Your Brain On Coronavirus", "Mind Your Nose", "The 4 Purposes of Dreams",
        "Surviving a Rod Through the Head", "Mentally, Young Adults Are Suffering Most From COVID",
        "How to Turn Your Popular Blog Series Into a Bestselling Book", "Dr Faisal Dar — Pioneer of Liver Transplantation in Pakistan",
        "Sunlight — The Natural Supplement For Our Mental Health", "Occam’s dice", "Is Six Feet of Social Distancing Always Necessary?",
        "Grandma Wants Revenge on Reindeer Who Ran Her Over", "Exploring the question of whether submarines can swim",
        "Predicting American ICU Saturation During COVID-19", "Purge your Followers, Bring an Instagram Account Back to Life"
    ],
    # Placeholder bodies: a short snippet repeated to give each article a
    # different length.
    'text': ["Merry Christmas..." * 100, "Your Brain..." * 150, "Mind Your Nose..." * 120, "Passionate about..." * 80,
             "You’ve heard..." * 90, "Mentally, Young..." * 110, "How to Turn..." * 130, "Dr Faisal Dar..." * 140,
             "Sunlight — The..." * 100, "Occam’s dice..." * 160, "6 Feet Apart..." * 115, "Grandma Wants..." * 85,
             "Exploring the..." * 95, "Like everybody..." * 145, "Purge your..." * 105],
    'authors': [['Ryan Fan'], ['Simon Spichak'], [], ['Eshan Samaranayake'], ['Rishav Sinha'], ['Ryan Fan'], ['Frank Mckinley'],
                ['Fatima Arif'], ['Jerren Gan'], ['Dr. Jason Fung'], ['Markham Heid'], ['Ryan Fan'], ['Danielle Boccelli'],
                ['Bj Campbell'], ['Charles Tumiotto Jackson']],
    'timestamp': ['2020-12-26', '2020-09-23', '2020-10-10', '2020-12-21', '2020-02-26', '2020-11-19', '2020-01-28',
                  '2019-02-28', '2020-12-16', '2019-02-20', '2020-07-31', '2020-12-24', '2020-12-20', '2020-04-14',
                  '2020-11-06'],
    'tags': [['Mental Health', 'Health', 'Psychology', 'Science', 'Neuroscience'], ['Mental Health', 'Coronavirus', 'Science', 'Psychology', 'Neuroscience'],
             ['Biotechnology', 'Neuroscience', 'Brain', 'Wellness', 'Science'], ['Health', 'Neuroscience', 'Mental Health', 'Psychology', 'Science'],
             ['Brain', 'Health', 'Development', 'Psychology', 'Science'], ['Society', 'Mental Health', 'Health', 'Nonfiction', 'Coronavirus'],
             ['Books', 'Entrepreneurship', 'Writing', 'Marketing', 'Productivity'], ['People', 'Storyfest', 'Health', 'Pakistan', 'Storytelling'],
             ['Self Improvement', 'Mental Health', 'Health', 'Wellness', 'Science'], ['Longevity', 'Wellness', 'Aging', 'Health', 'Nutrition'],
             ['Health', 'Pandemic', 'Covid 19', 'The Nuance', 'Coronavirus'], ['Social Media', 'Satire', 'Humor', 'Books', 'Music'],
             ['AI', 'Artificial Intelligence', 'Data Science', 'Computer Science', 'NLP'], ['Coronavirus', 'Covid 19', 'Health', 'Media Criticism', 'Random'],
             ['Marketing', 'Business', 'Startup', 'Social Media', 'Instagram']]
}

# Create DataFrame (this replaces the frame loaded from the CSV above).
df = pd.DataFrame(simulated_articles)

# Simple size features: characters of text, number of authors, number of tags.
df['text_length'] = df['text'].apply(len)
df['author_count'] = df['authors'].apply(len)
df['tag_count'] = df['tags'].apply(len)

# Calendar features derived from the publication date.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['timestamp_year'] = df['timestamp'].dt.year
df['timestamp_month'] = df['timestamp'].dt.month

In [None]:
# Synthetic engagement score per article: integers drawn uniformly from
# [50, 500). A fixed seed keeps the correlations and charts below identical
# across kernel restarts.
ENGAGEMENT_SEED = 42
df['engagement'] = np.random.default_rng(ENGAGEMENT_SEED).integers(50, 500, size=len(df))

In [None]:
# Two histograms in the top row of a 2x2 grid: text length on the left,
# tag count on the right.
fig = plt.figure(figsize=(12, 8))

length_ax = fig.add_subplot(2, 2, 1)
length_ax.hist(df['text_length'], bins=10, color='skyblue')
length_ax.set_title('Distribution of Text Length')
length_ax.set_xlabel('Text Length')
length_ax.set_ylabel('Frequency')

tags_ax = fig.add_subplot(2, 2, 2)
tags_ax.hist(df['tag_count'], bins=5, color='lightgreen')
tags_ax.set_title('Distribution of Tag Count')
tags_ax.set_xlabel('Tag Count')
tags_ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Box plot comparing the text-length distribution of each publication year.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='timestamp_year', y='text_length', ax=ax)
ax.set_title('Text Length by Year')
plt.show()

In [None]:
# Correlation Heatmap
# Pairwise correlations between the numeric features and engagement, drawn on
# a fixed -1..1 colour scale with the coefficient printed in each cell.
heatmap_columns = [
    'text_length', 'author_count', 'tag_count',
    'timestamp_year', 'timestamp_month', 'engagement',
]
correlation_matrix = df[heatmap_columns].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()


In [None]:
# Pairwise scatter/density grid of the selected columns, coloured by
# publication year.
pairplot_columns = ['text_length', 'tag_count', 'engagement', 'timestamp_year']
sns.pairplot(df[pairplot_columns], hue='timestamp_year')
plt.show()

In [None]:
# Subplot Dashboard
# Creates the 2x2 plotly figure that the following cells add traces to.
# `make_subplots` (plotly.subplots) and `go` (plotly.graph_objects) are
# imported in the notebook's import cell.
fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=('Text Length Distribution', 'Engagement vs Text Length',
                    'Articles by Year', 'Tag Count by Month'),
)

# Histogram of article text lengths in the top-left panel.
fig.add_trace(go.Histogram(x=df['text_length'], name='Text Length'), row=1, col=1)

In [None]:
# Top-right panel: engagement against text length, one marker per article,
# coloured by publication year.
engagement_scatter = go.Scatter(
    x=df['text_length'],
    y=df['engagement'],
    mode='markers',
    marker=dict(color=df['timestamp_year']),
    name='Engagement',
)
fig.add_trace(engagement_scatter, row=1, col=2)

In [None]:
# Bottom-left panel: number of articles per publication year, in
# chronological order.
year_counts = df['timestamp_year'].value_counts().sort_index()
articles_bar = go.Bar(x=year_counts.index, y=year_counts.values, name='Articles')
fig.add_trace(articles_bar, row=2, col=1)

In [None]:
# Bottom-right panel: tag-count distribution per publication month.
tag_count_box = go.Box(x=df['timestamp_month'], y=df['tag_count'], name='Tag Count')
fig.add_trace(tag_count_box, row=2, col=2)

# Size and title the finished dashboard, then render it.
fig.update_layout(
    height=800,
    width=1000,
    title_text="Interactive Dashboard: Article Analysis",
)
fig.show()

In [None]:
# State the question being answered, then list every feature's correlation
# with engagement from strongest positive to strongest negative.
print("What factors influence article engagement?")
print("\nInsights:")
# Correlation with engagement
print("Correlation with Engagement:")
print(correlation_matrix['engagement'].sort_values(ascending=False))

In [None]:
# Mean engagement per publication year and per publication month.
yearly_engagement = df['engagement'].groupby(df['timestamp_year']).mean()
monthly_engagement = df['engagement'].groupby(df['timestamp_month']).mean()

print("\nAverage Engagement by Year:", yearly_engagement, sep="\n")
print("\nAverage Engagement by Month:", monthly_engagement, sep="\n")

In [None]:
# Print the closing checklist of factors to review against the plots above.
print("\n".join([
    "\nKey Factors Affecting Engagement:",
    "- Text Length: Longer articles may correlate with higher/lower engagement (check correlation).",
    "- Tag Count: More tags might indicate broader appeal (check scatter plot).",
    "- Publication Timing: Certain months/years show higher engagement (check bar/box plots).",
]))

In [None]:
# Feature scaling
# The libraries this section uses (pandas, numpy, the sklearn scalers,
# scipy's pearsonr and textstat's flesch_reading_ease) are imported in the
# notebook's import cell.

# Simulated dataset (replace with your actual preprocessed data)
# The original variable name was a file path, which is not a valid Python
# identifier. None of the later cells visible in this notebook read this dict;
# they keep working on `df`.
simulated_articles = {
    'title': [
        "Mental Note Vol. 24", "Your Brain On Coronavirus", "Mind Your Nose", "The 4 Purposes of Dreams",
        "Surviving a Rod Through the Head", "Mentally, Young Adults Are Suffering Most From COVID",
        "How to Turn Your Popular Blog Series Into a Bestselling Book", "Dr Faisal Dar — Pioneer of Liver Transplantation in Pakistan",
        "Sunlight — The Natural Supplement For Our Mental Health", "Occam’s dice", "Is Six Feet of Social Distancing Always Necessary?",
        "Grandma Wants Revenge on Reindeer Who Ran Her Over", "Exploring the question of whether submarines can swim",
        "Predicting American ICU Saturation During COVID-19", "Purge your Followers, Bring an Instagram Account Back to Life"
    ],
    'text': ["Merry Christmas..." * 100, "Your Brain..." * 150, "Mind Your Nose..." * 120, "Passionate about..." * 80,
             "You’ve heard..." * 90, "Mentally, Young..." * 110, "How to Turn..." * 130, "Dr Faisal Dar..." * 140,
             "Sunlight — The..." * 100, "Occam’s dice..." * 160, "6 Feet Apart..." * 115, "Grandma Wants..." * 85,
             "Exploring the..." * 95, "Like everybody..." * 145, "Purge your..." * 105],
    'authors': [['Ryan Fan'], ['Simon Spichak'], [], ['Eshan Samaranayake'], ['Rishav Sinha'], ['Ryan Fan'], ['Frank Mckinley'],
                ['Fatima Arif'], ['Jerren Gan'], ['Dr. Jason Fung'], ['Markham Heid'], ['Ryan Fan'], ['Danielle Boccelli'],
                ['Bj Campbell'], ['Charles Tumiotto Jackson']],
    'timestamp': ['2020-12-26', '2020-09-23', '2020-10-10', '2020-12-21', '2020-02-26', '2020-11-19', '2020-01-28',
                  '2019-02-28', '2020-12-16', '2019-02-20', '2020-07-31', '2020-12-24', '2020-12-20', '2020-04-14',
                  '2020-11-06'],
    'tags': [['Mental Health', 'Health', 'Psychology', 'Science', 'Neuroscience'], ['Mental Health', 'Coronavirus', 'Science', 'Psychology', 'Neuroscience'],
             ['Biotechnology', 'Neuroscience', 'Brain', 'Wellness', 'Science'], ['Health', 'Neuroscience', 'Mental Health', 'Psychology', 'Science'],
             ['Brain', 'Health', 'Development', 'Psychology', 'Science'], ['Society', 'Mental Health', 'Health', 'Nonfiction', 'Coronavirus'],
             ['Books', 'Entrepreneurship', 'Writing', 'Marketing', 'Productivity'], ['People', 'Storyfest', 'Health', 'Pakistan', 'Storytelling'],
             ['Self Improvement', 'Mental Health', 'Health', 'Wellness', 'Science'], ['Longevity', 'Wellness', 'Aging', 'Health', 'Nutrition'],
             ['Health', 'Pandemic', 'Covid 19', 'The Nuance', 'Coronavirus'], ['Social Media', 'Satire', 'Humor', 'Books', 'Music'],
             ['AI', 'Artificial Intelligence', 'Data Science', 'Computer Science', 'NLP'], ['Coronavirus', 'Covid 19', 'Health', 'Media Criticism', 'Random'],
             ['Marketing', 'Business', 'Startup', 'Social Media', 'Instagram']]
}





In [None]:
# 2. Feature Scaling
# Numeric feature columns to scale. This list was referenced but never defined
# in the notebook.
# NOTE(review): chosen as the engineered numeric features, leaving out the
# `engagement` target — confirm this is the intended set.
numerical_cols = ['text_length', 'author_count', 'tag_count', 'timestamp_year', 'timestamp_month']

# Min-Max Scaling: rescale each listed column to the [0, 1] range.
minmax_scaler = MinMaxScaler()
df_minmax = pd.DataFrame(
    minmax_scaler.fit_transform(df[numerical_cols]),
    columns=[f"{col}_minmax" for col in numerical_cols],
    # Reuse df's index so the later axis=1 concat lines rows up correctly.
    index=df.index,
)

In [None]:

# Standardization: rescale each listed column to zero mean and unit variance.
standard_scaler = StandardScaler()
df_standard = pd.DataFrame(
    standard_scaler.fit_transform(df[numerical_cols]),
    columns=[f"{col}_std" for col in numerical_cols],
    # Reuse df's index so the later axis=1 concat lines rows up correctly even
    # if df does not have a default 0..n-1 index.
    index=df.index,
)


In [None]:



# Combine with original DataFrame
# Any scaled columns left over from a previous run of this cell are dropped
# first, so re-running it does not pile up duplicate columns.
scaled_columns = list(df_minmax.columns) + list(df_standard.columns)
df = pd.concat(
    [df.drop(columns=scaled_columns, errors='ignore'), df_minmax, df_standard],
    axis=1,
)
print("\nScaled Features (first few rows):")
print(df[scaled_columns].head())

In [None]:


# 3. Create New Meaningful Features
# Feature 1: Text Complexity (Flesch Reading Ease Score)
# flesch_reading_ease is applied to each article's text individually.
df['text_complexity'] = df['text'].apply(flesch_reading_ease)  # Higher score = easier to read


In [None]:

# Feature 2: Is Weekend Publication (domain knowledge: weekend posts might get more engagement)
# dayofweek runs Monday=0 .. Sunday=6, so values of 5 and above are Saturday and Sunday.
df['is_weekend'] = (df['timestamp'].dt.dayofweek >= 5).astype(int)

In [None]:

# Feature 3: Tag Diversity (unique tags as a proxy for topic breadth)
# Vocabulary of every distinct tag across all articles.
all_tags = set(tag for tags in df['tags'] for tag in tags)
# Sorted before numbering so each tag gets the same index on every run
# (set iteration order is not stable between kernel sessions).
tag_to_idx = {tag: i for i, tag in enumerate(sorted(all_tags))}
# Share of the vocabulary each article covers; 0.0 when there are no tags at
# all, instead of dividing by zero.
vocabulary_size = len(all_tags)
df['tag_diversity'] = df['tags'].apply(
    lambda tags: len(set(tags)) / vocabulary_size if vocabulary_size else 0.0
)

In [None]:

# Feature 4: Seasonality (quarter of the year)
# Taken from the publication timestamp's calendar quarter.
df['seasonality'] = df['timestamp'].dt.quarter

In [None]:

# Display new features
# Prints the first rows of the four engineered columns as a quick check.
print("\nNew Features (first few rows):")
print(df[['text_complexity', 'is_weekend', 'tag_diversity', 'seasonality']].head())


In [None]:
# Re-assess correlations with new features
# Pearson correlation coefficient between each engineered feature and
# engagement, keyed by feature name.
new_features = ['text_complexity', 'is_weekend', 'tag_diversity', 'seasonality']
new_correlations = dict(zip(
    new_features,
    (pearsonr(df[name], df['engagement'])[0] for name in new_features),
))

print("\nCorrelations of New Features with Engagement:")
for col, corr in new_correlations.items():
    print("{}: {:.3f}".format(col, corr))

In [None]:
# Update relevant features
# Minimum |correlation| with engagement for a feature to count as relevant.
CORRELATION_THRESHOLD = 0.1

# NOTE(review): `relevant_features` was extended here but never defined in the
# visible notebook. It is rebuilt from the heatmap's correlation matrix
# (engagement's correlation with itself excluded) — confirm this matches the
# intended baseline.
baseline_correlations = correlation_matrix['engagement'].drop('engagement')
relevant_features = baseline_correlations[
    baseline_correlations.abs() > CORRELATION_THRESHOLD
].index.tolist()

# Add the engineered features that clear the same threshold. NaN correlations
# (e.g. from a constant column) compare False and are left out.
relevant_features.extend(
    col for col, corr in new_correlations.items() if abs(corr) > CORRELATION_THRESHOLD
)

# De-duplicate while keeping a stable order, so the printed list is the same
# on every run.
relevant_features = list(dict.fromkeys(relevant_features))
print("\nUpdated Relevant Features (abs(corr) > 0.1):", relevant_features)