In [1]:
import pandas as pd

# Read the cleaned, preprocessed dataset into `df`, which the later cells use.
# NOTE(review): the recorded preview shows mojibake in the text column
# (e.g. 'ðŸ’ª'), which hints at an encoding mismatch somewhere upstream --
# confirm the file's encoding before relying on the text values.
df = pd.read_csv("cleaned_preprocessed_data.csv")

# Preview the first five rows as a sanity check on the load.
print(df.head(5))



                                                text    sentiment  \
0   Enjoying a beautiful day at the park!        ...   Positive     
1   Traffic was terrible this morning.           ...   Negative     
2   Just finished an amazing workout! ðŸ’ª          ...   Positive     
3   Excited about the upcoming weekend getaway!  ...   Positive     
4   Trying out a new recipe for dinner tonight.  ...   Neutral      

             timestamp            user     platform  \
0  2023-01-15 12:30:00   User123          Twitter     
1  2023-01-15 08:45:00   CommuterX        Twitter     
2  2023-01-15 15:45:00   FitnessFan      Instagram    
3  2023-01-15 18:20:00   AdventureX       Facebook    
4  2023-01-15 19:55:00   ChefCook        Instagram    

                                     hashtags  retweets  likes       country  \
0   #Nature #Park                                  15.0   30.0     USA         
1   #Traffic #Morning                               5.0   10.0     Canada      
2   #Fitnes

In [5]:
# Numeric features: keep only the numeric-dtype columns of df.
numeric_cols = df.select_dtypes(include='number')

# describe() is transposed so each feature is a row; median and variance are
# appended as two extra columns after the describe() statistics.
numeric_summary = numeric_cols.describe().T.assign(
    median=numeric_cols.median(),
    variance=numeric_cols.var(),
)

print(numeric_summary)

# Categorical features: print the frequency table of every object/category column.
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
for col in categorical_cols:
    frequencies = df[col].value_counts()
    print(f"Value counts for {col}:\n{frequencies}\n")


                   count         mean         std     min      25%     50%  \
retweets           732.0    21.508197    7.061286     5.0    17.75    22.0   
likes              732.0    42.901639   14.089848    10.0    34.75    43.0   
year               732.0  2020.471311    2.802285  2010.0  2019.00  2021.0   
month              732.0     6.122951    3.411763     1.0     3.00     6.0   
day                732.0    15.497268    8.474553     1.0     9.00    15.0   
hour               732.0    15.521858    4.113414     0.0    13.00    16.0   
text_encoded       732.0   348.054645  205.842295     0.0   170.75   344.5   
sentiment_encoded  732.0   138.405738   73.735566     0.0    78.00   139.0   
timestamp_encoded  732.0   334.312842  195.330508     0.0   167.75   330.0   
user_encoded       732.0   343.538251  197.974711     0.0   171.00   343.5   
platform_encoded   732.0     1.173497    1.044146     0.0     0.00     1.0   
hashtags_encoded   732.0   349.203552  200.283495     0.0   174.

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# -----------------------------
# Step 0: Setup
# -----------------------------
# Every CSV, figure, and the text report produced by this cell is written here.
output_dir = 'EDA_Outputs'
os.makedirs(output_dir, exist_ok=True)

# Note: df is already loaded from a previous cell

# -----------------------------
# Step 1: Quick overview
# -----------------------------
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()
print("\nFirst 5 rows:")
print(df.head())
print("\nSummary Statistics:")
print(df.describe())

# -----------------------------
# Step 2: Summary Statistics
# -----------------------------
# 'number' selects every numeric dtype (e.g. int32 or float32 as well as
# int64/float64), matching the numeric selection used in the earlier summary
# cell, so columns stored at another width are not silently left out.
numeric_cols = df.select_dtypes(include='number').columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# Slice once and reuse for every statistic below.
numeric_df = df[numeric_cols]

# One row per numeric feature: describe() statistics plus four extra columns.
numeric_summary = numeric_df.describe().T
numeric_summary['median'] = numeric_df.median()
numeric_summary['variance'] = numeric_df.var()
numeric_summary['skewness'] = numeric_df.skew()
numeric_summary['missing'] = numeric_df.isnull().sum()
numeric_summary.to_csv(os.path.join(output_dir, 'numeric_summary.csv'))

print("\nNumeric Summary:")
print(numeric_summary)

# Categorical counts: keep each frequency table in cat_counts (keyed by column
# name), save it as its own CSV in output_dir, and print it.
cat_counts = {}
for col in categorical_cols:
    counts = df[col].value_counts()
    cat_counts[col] = counts
    counts.to_csv(os.path.join(output_dir, f'{col}_value_counts.csv'))
    print(f"\nValue counts for {col}:\n{counts}")

# -----------------------------
# Step 3: Visualizations
# -----------------------------
# Each figure is drawn on its own explicit Axes, saved into output_dir, and
# closed right away so open figures do not pile up across the loops.

# Histograms (30 bins, with a KDE overlay), one per numeric column
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[col], kde=True, bins=30, ax=ax)
    ax.set_title(f'Distribution of {col}')
    fig.savefig(os.path.join(output_dir, f'hist_{col}.png'))
    plt.close(fig)

# Boxplots, one per numeric column
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    fig.savefig(os.path.join(output_dir, f'box_{col}.png'))
    plt.close(fig)

# Scatter plots (pairwise for first 5 numeric columns, adjust as needed)
pair_cols = numeric_cols[:5]  # first 5 numeric columns
# Every unordered pair exactly once: each column is paired only with the
# columns that come after it.
column_pairs = [
    (x_col, y_col)
    for pos, x_col in enumerate(pair_cols)
    for y_col in pair_cols[pos + 1:]
]
for x_col, y_col in column_pairs:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(x=df[x_col], y=df[y_col], ax=ax)
    ax.set_title(f'{x_col} vs {y_col}')
    fig.savefig(os.path.join(output_dir, f'scatter_{x_col}_vs_{y_col}.png'))
    plt.close(fig)

# -----------------------------
# Step 4: Correlation Analysis
# -----------------------------
# Pairwise correlation between all numeric columns.
corr_matrix = df[numeric_cols].corr()

# Annotated heatmap (two-decimal cell labels) saved into output_dir.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True, ax=ax)
ax.set_title('Correlation Matrix')
fig.savefig(os.path.join(output_dir, 'correlation_matrix.png'))
plt.close(fig)

# Top correlations
# Keep each unordered column pair exactly once by masking the strict lower
# triangle of the matrix. Selecting by position (instead of filtering on
# value < 1 and de-duplicating by value) drops only the diagonal and the
# mirrored entries, so distinct pairs that happen to share a coefficient, or
# that are perfectly correlated, are no longer lost.
pair_mask = np.tril(np.ones(corr_matrix.shape, dtype=bool), k=-1)
# unstack() yields a Series indexed by (column, row); dropna() removes the
# masked-out cells and any undefined (NaN) correlations.
pair_corr = corr_matrix.where(pair_mask).unstack().dropna()
# Rank by magnitude so strong negative relationships are reported too; the
# values themselves keep their sign.
high_corr = pair_corr.reindex(pair_corr.abs().sort_values(ascending=False).index)

# -----------------------------
# Step 5: Insights Report
# -----------------------------
insights = []

# Numeric insights
for col in numeric_cols:
    series = df[col]
    skewness = series.skew()
    skew_flag = " (High Skew!)" if abs(skewness) > 1 else ""
    # Percentile rule: values strictly outside the 1st-99th percentile range
    # are counted as potential outliers.
    low_cut, high_cut = series.quantile([0.01, 0.99])
    n_outliers = int(((series < low_cut) | (series > high_cut)).sum())
    outlier_flag = f" ({n_outliers} potential outliers)" if n_outliers > 0 else ""
    insights.append(f"{col} - Mean: {series.mean():.2f}, Median: {series.median():.2f}, Variance: {series.var():.2f}, Skewness: {skewness:.2f}{skew_flag}{outlier_flag}")

# Top correlations
insights.append("\nTop 5 correlations by absolute strength (self-correlations excluded):")
insights.extend([f"{idx[0]} & {idx[1]}: {val:.2f}" for idx, val in high_corr.head(5).items()])

# Categorical insights
insights.append("\nCategorical Columns Insights:")
for col in categorical_cols:
    col_counts = df[col].value_counts()
    if col_counts.empty:
        # A column with no non-null values has no most-frequent category.
        insights.append(f"{col} - no non-null values")
        continue
    top_cat = col_counts.idxmax()
    top_freq = col_counts.max()
    insights.append(f"{col} - Most frequent: {top_cat} ({top_freq} occurrences)")

# Save report
report_path = os.path.join(output_dir, 'EDA_Report.txt')
# Explicit UTF-8: the report embeds raw category values, and the platform
# default encoding can fail on non-ASCII characters (e.g. emoji in text).
with open(report_path, 'w', encoding='utf-8') as f:
    f.writelines(f"{line}\n" for line in insights)

print(f"\nEDA complete! All outputs saved in '{output_dir}' folder.")



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   text               732 non-null    object 
 1   sentiment          732 non-null    object 
 2   timestamp          732 non-null    object 
 3   user               732 non-null    object 
 4   platform           732 non-null    object 
 5   hashtags           732 non-null    object 
 6   retweets           732 non-null    float64
 7   likes              732 non-null    float64
 8   country            732 non-null    object 
 9   year               732 non-null    int64  
 10  month              732 non-null    int64  
 11  day                732 non-null    int64  
 12  hour               732 non-null    int64  
 13  text_encoded       732 non-null    int64  
 14  sentiment_encoded  732 non-null    int64  
 15  timestamp_encoded  732 non-null    int64  
 16  user_encoded