# Exploratory Data Analysis — Personal Stylist Based on Mood & Weather
This Colab notebook performs EDA on the provided dataset `alldata.csv`. The notebook structure mirrors your sample notebook (same number of cells).

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
print('Imported libraries')


In [None]:
# Load dataset, falling back to a more permissive encoding if UTF-8 decoding fails.
from pathlib import Path

path = Path('/mnt/data/alldata.csv')
# 'latin1' and 'iso-8859-1' name the same codec, so listing both only repeated
# the same attempt. latin1 maps every byte value, so it acts as the last resort.
encodings = ['utf-8', 'latin1']
for enc in encodings:
    try:
        df = pd.read_csv(path, encoding=enc)
    except UnicodeDecodeError as exc:
        # Only a decoding failure justifies trying another encoding. A missing
        # file or a parser error propagates unchanged instead of being reported
        # as an encoding problem.
        print('Failed with', enc, '->', type(exc).__name__)
    else:
        print('Loaded with encoding:', enc)
        break
else:
    # Reached only when every encoding in the list failed to decode the file.
    raise ValueError('Could not read alldata.csv with tried encodings')

df.head()


In [None]:
# Report the frame's (rows, columns) dimensions, then print pandas' column summary
print('Shape:', (len(df.index), len(df.columns)))
df.info()


In [None]:
# Summary statistics, restricted to the numeric columns
(
    df
    .select_dtypes(include=[np.number])
    .describe()
)


In [None]:
# Null count per column, largest first; display only columns with at least one null
missing = df.isna().sum().sort_values(ascending=False)
missing.loc[missing > 0]


In [None]:
# Ten most frequent values for each of the first five object/category columns
cat_cols = (
    df.select_dtypes(include=['object', 'category'])
    .columns
    .tolist()
)
for col_name in cat_cols[:5]:
    top_counts = df[col_name].value_counts().head(10)
    print('\n', col_name)
    print(top_counts)


In [None]:
# Convert every column whose name contains 'date' (case-insensitive) to datetime
date_cols = [name for name in df.columns if 'date' in name.lower()]
if not date_cols:
    print('No date columns detected')
else:
    for name in date_cols:
        # errors='coerce': entries that cannot be parsed become NaT instead of raising
        df[name] = pd.to_datetime(df[name], errors='coerce')
    print('Parsed dates:', date_cols)


In [None]:
# Add 'month' and 'dayofweek' columns taken from the first detected date column
if not date_cols:
    print('Skipping date-derived features')
else:
    first_date = df[date_cols[0]]
    df['month'] = first_date.dt.month
    df['dayofweek'] = first_date.dt.dayofweek
    display(df[['month','dayofweek']].head())


In [None]:
# Histogram with KDE overlay for each of the first four numeric columns
num_cols = (
    df.select_dtypes(include=[np.number])
    .columns
    .tolist()
)
for c in num_cols[:4]:
    values = df[c].dropna()  # drop nulls before plotting
    plt.figure(figsize=(6, 3))
    sns.histplot(values, kde=True)
    plt.title(f'Distribution of {c}')
    plt.show()


In [None]:
# One horizontal boxplot per column, for the first four numeric columns
for c in num_cols[:4]:
    values = df[c].dropna()  # drop nulls before plotting
    plt.figure(figsize=(6, 2))
    sns.boxplot(x=values)
    plt.title(f'Boxplot of {c}')
    plt.show()


In [None]:
# Annotated heatmap of pairwise correlations; needs at least two numeric columns
if len(num_cols) <= 1:
    print('Not enough numeric cols for correlation')
else:
    corr_matrix = df[num_cols].corr()
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
    plt.title('Correlation Matrix')
    plt.show()


In [None]:
# Boxplot of the first 'temp'-named column grouped by the 'mood' column.
# Mood is matched by exact name, temperature by substring (both case-insensitive).
mood_col = [name for name in df.columns if name.lower() == 'mood']
temp_col = [name for name in df.columns if 'temp' in name.lower()]
if not (mood_col and temp_col):
    print('Mood-temp plot skipped')
else:
    plt.figure(figsize=(8, 4))
    sns.boxplot(data=df, x=mood_col[0], y=temp_col[0])
    plt.title('Temperature by Mood')
    plt.show()


In [None]:
# Most common values in up to three recommendation-like columns, i.e. columns
# whose name contains 'recommend', 'top' or 'footwear' (case-insensitive)
rec_keywords = ('recommend', 'top', 'footwear')
rec_cols = [c for c in df.columns if any(k in c.lower() for k in rec_keywords)]
if not rec_cols:
    print('No recommendation-like columns')
else:
    for name in rec_cols[:3]:
        print('\nTop values for', name)
        display(df[name].value_counts().head(10))


In [None]:
# Mean 'rating' grouped by whichever of 'weather' / 'mood' exist (first 10 rows shown)
if 'rating' not in df.columns:
    print('No rating column')
else:
    grp = [key for key in ('weather', 'mood') if key in df.columns]
    if not grp:
        print('No grouping columns present')
    else:
        mean_rating = df.groupby(grp)['rating'].mean().reset_index()
        display(mean_rating.head(10))


In [None]:
# Simple imputation: numeric -> column median, object/category -> 'Unknown'.
# Note: this overwrites the columns of df in place.
for c in df.select_dtypes(include=[np.number]).columns:
    df[c] = df[c].fillna(df[c].median())
for c in df.select_dtypes(include=['object','category']).columns:
    col = df[c]
    # A categorical column rejects a fill value that is not one of its
    # categories, so register 'Unknown' first; object columns need no such step.
    if isinstance(col.dtype, pd.CategoricalDtype) and 'Unknown' not in col.cat.categories:
        col = col.cat.add_categories(['Unknown'])
    df[c] = col.fillna('Unknown')
print('Imputation done')


In [None]:
# Pie chart of the share of each value in the 'weather' column, when it exists
if 'weather' not in df.columns:
    print('No weather column')
else:
    weather_counts = df['weather'].value_counts()
    plt.figure(figsize=(6, 6))
    weather_counts.plot.pie(autopct='%1.1f%%')
    plt.title('Weather Distribution')
    plt.ylabel('')  # blank out the y-axis label
    plt.show()


In [None]:
# Write the current state of df to CSV, leaving out the row index
out_path = '/mnt/data/alldata_cleaned_for_eda.csv'
df.to_csv(path_or_buf=out_path, index=False)
print('Cleaned data saved to', out_path)


---
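**Notes:** This notebook runs a basic exploratory analysis: data info, missing values, distributions, correlations, and grouped summaries. It also applies a simple imputation (numeric columns → median, categorical columns → `Unknown`) and saves the result to `alldata_cleaned_for_eda.csv`. Next steps: feature engineering and predictive modeling.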


# Exploratory Data Analysis — Personal Stylist Based on Mood & Weather
This Colab notebook performs EDA on the provided dataset `alldata.csv`. The notebook structure mirrors your sample notebook (same number of cells).

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
print('Imported libraries')


In [None]:
# Load dataset; retry with latin1 when the file is not valid UTF-8
path = '/mnt/data/alldata.csv'
try:
    df = pd.read_csv(path, encoding='utf-8')
except UnicodeDecodeError:
    # latin1 maps every byte value, so this retry cannot fail on decoding.
    # Other errors (missing file, parser errors) are left to propagate.
    df = pd.read_csv(path, encoding='latin1')
    print('Loaded with encoding:', 'latin1')
df.head()


In [None]:
# Report the frame's (rows, columns) dimensions, then print pandas' column summary
print('Shape:', (len(df.index), len(df.columns)))
df.info()


In [None]:
# Summary statistics, restricted to the numeric columns
(
    df
    .select_dtypes(include=[np.number])
    .describe()
)


In [None]:
# Null count per column, largest first; display only columns with at least one null
missing = df.isna().sum().sort_values(ascending=False)
missing.loc[missing > 0]


In [None]:
# Ten most frequent values for each of the first five object/category columns
cat_cols = (
    df.select_dtypes(include=['object', 'category'])
    .columns
    .tolist()
)
for col_name in cat_cols[:5]:
    top_counts = df[col_name].value_counts().head(10)
    print('\n', col_name)
    print(top_counts)


In [None]:
# Convert every column whose name contains 'date' (case-insensitive) to datetime
date_cols = [name for name in df.columns if 'date' in name.lower()]
if not date_cols:
    print('No date columns detected')
else:
    for name in date_cols:
        # errors='coerce': entries that cannot be parsed become NaT instead of raising
        df[name] = pd.to_datetime(df[name], errors='coerce')
    print('Parsed dates:', date_cols)


In [None]:
# Add 'month' and 'dayofweek' columns taken from the first detected date column
if not date_cols:
    print('Skipping date-derived features')
else:
    first_date = df[date_cols[0]]
    df['month'] = first_date.dt.month
    df['dayofweek'] = first_date.dt.dayofweek
    display(df[['month','dayofweek']].head())


In [None]:
# Histogram with KDE overlay for each of the first four numeric columns
num_cols = (
    df.select_dtypes(include=[np.number])
    .columns
    .tolist()
)
for c in num_cols[:4]:
    values = df[c].dropna()  # drop nulls before plotting
    plt.figure(figsize=(6, 3))
    sns.histplot(values, kde=True)
    plt.title(f'Distribution of {c}')
    plt.show()


In [None]:
# One horizontal boxplot per column, for the first four numeric columns
for c in num_cols[:4]:
    values = df[c].dropna()  # drop nulls before plotting
    plt.figure(figsize=(6, 2))
    sns.boxplot(x=values)
    plt.title(f'Boxplot of {c}')
    plt.show()


In [None]:
# Annotated heatmap of pairwise correlations; needs at least two numeric columns
if len(num_cols) <= 1:
    print('Not enough numeric cols for correlation')
else:
    corr_matrix = df[num_cols].corr()
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
    plt.title('Correlation Matrix')
    plt.show()


In [None]:
# Boxplot of the first 'temp'-named column grouped by the 'mood' column.
# Mood is matched by exact name, temperature by substring (both case-insensitive).
mood_col = [name for name in df.columns if name.lower() == 'mood']
temp_col = [name for name in df.columns if 'temp' in name.lower()]
if not (mood_col and temp_col):
    print('Mood-temp plot skipped')
else:
    plt.figure(figsize=(8, 4))
    sns.boxplot(data=df, x=mood_col[0], y=temp_col[0])
    plt.title('Temperature by Mood')
    plt.show()


In [None]:
# Most common values in up to three recommendation-like columns, i.e. columns
# whose name contains 'recommend', 'top' or 'footwear' (case-insensitive)
rec_keywords = ('recommend', 'top', 'footwear')
rec_cols = [c for c in df.columns if any(k in c.lower() for k in rec_keywords)]
if not rec_cols:
    print('No recommendation-like columns')
else:
    for name in rec_cols[:3]:
        print('\nTop values for', name)
        display(df[name].value_counts().head(10))


In [None]:
# Mean 'rating' grouped by whichever of 'weather' / 'mood' exist (first 10 rows shown)
if 'rating' not in df.columns:
    print('No rating column')
else:
    grp = [key for key in ('weather', 'mood') if key in df.columns]
    if not grp:
        print('No grouping columns present')
    else:
        mean_rating = df.groupby(grp)['rating'].mean().reset_index()
        display(mean_rating.head(10))


In [None]:
# Simple imputation: numeric -> column median, object/category -> 'Unknown'.
# Note: this overwrites the columns of df in place.
for c in df.select_dtypes(include=[np.number]).columns:
    df[c] = df[c].fillna(df[c].median())
for c in df.select_dtypes(include=['object','category']).columns:
    col = df[c]
    # A categorical column rejects a fill value that is not one of its
    # categories, so register 'Unknown' first; object columns need no such step.
    if isinstance(col.dtype, pd.CategoricalDtype) and 'Unknown' not in col.cat.categories:
        col = col.cat.add_categories(['Unknown'])
    df[c] = col.fillna('Unknown')
print('Imputation done')


In [None]:
# Pie chart of the share of each value in the 'weather' column, when it exists
if 'weather' not in df.columns:
    print('No weather column')
else:
    weather_counts = df['weather'].value_counts()
    plt.figure(figsize=(6, 6))
    weather_counts.plot.pie(autopct='%1.1f%%')
    plt.title('Weather Distribution')
    plt.ylabel('')  # blank out the y-axis label
    plt.show()


In [None]:
# Write the current state of df to CSV, leaving out the row index
out_path = '/mnt/data/alldata_cleaned_for_eda.csv'
df.to_csv(path_or_buf=out_path, index=False)
print('Cleaned data saved to', out_path)


---
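**Notes:** This notebook runs a basic exploratory analysis: data info, missing values, distributions, correlations, and grouped summaries. It also applies a simple imputation (numeric columns → median, categorical columns → `Unknown`) and saves the result to `alldata_cleaned_for_eda.csv`. Next steps: feature engineering and predictive modeling.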


# Exploratory Data Analysis — Personal Stylist Based on Mood & Weather
This Colab notebook performs EDA on the provided dataset `alldata.csv`. The notebook structure mirrors your sample notebook (same number of cells).

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
print('Imported libraries')


In [None]:
# Load dataset; retry with latin1 when the file is not valid UTF-8
path = '/mnt/data/alldata.csv'
try:
    df = pd.read_csv(path, encoding='utf-8')
except UnicodeDecodeError:
    # latin1 maps every byte value, so this retry cannot fail on decoding.
    # Other errors (missing file, parser errors) are left to propagate.
    df = pd.read_csv(path, encoding='latin1')
    print('Loaded with encoding:', 'latin1')
df.head()


In [None]:
# Report the frame's (rows, columns) dimensions, then print pandas' column summary
print('Shape:', (len(df.index), len(df.columns)))
df.info()


In [None]:
# Summary statistics, restricted to the numeric columns
(
    df
    .select_dtypes(include=[np.number])
    .describe()
)


In [None]:
# Null count per column, largest first; display only columns with at least one null
missing = df.isna().sum().sort_values(ascending=False)
missing.loc[missing > 0]


In [None]:
# Ten most frequent values for each of the first five object/category columns
cat_cols = (
    df.select_dtypes(include=['object', 'category'])
    .columns
    .tolist()
)
for col_name in cat_cols[:5]:
    top_counts = df[col_name].value_counts().head(10)
    print('\n', col_name)
    print(top_counts)


In [None]:
# Convert every column whose name contains 'date' (case-insensitive) to datetime
date_cols = [name for name in df.columns if 'date' in name.lower()]
if not date_cols:
    print('No date columns detected')
else:
    for name in date_cols:
        # errors='coerce': entries that cannot be parsed become NaT instead of raising
        df[name] = pd.to_datetime(df[name], errors='coerce')
    print('Parsed dates:', date_cols)


In [None]:
# Add 'month' and 'dayofweek' columns taken from the first detected date column
if not date_cols:
    print('Skipping date-derived features')
else:
    first_date = df[date_cols[0]]
    df['month'] = first_date.dt.month
    df['dayofweek'] = first_date.dt.dayofweek
    display(df[['month','dayofweek']].head())


In [None]:
# Histogram with KDE overlay for each of the first four numeric columns
num_cols = (
    df.select_dtypes(include=[np.number])
    .columns
    .tolist()
)
for c in num_cols[:4]:
    values = df[c].dropna()  # drop nulls before plotting
    plt.figure(figsize=(6, 3))
    sns.histplot(values, kde=True)
    plt.title(f'Distribution of {c}')
    plt.show()


In [None]:
# One horizontal boxplot per column, for the first four numeric columns
for c in num_cols[:4]:
    values = df[c].dropna()  # drop nulls before plotting
    plt.figure(figsize=(6, 2))
    sns.boxplot(x=values)
    plt.title(f'Boxplot of {c}')
    plt.show()


In [None]:
# Annotated heatmap of pairwise correlations; needs at least two numeric columns
if len(num_cols) <= 1:
    print('Not enough numeric cols for correlation')
else:
    corr_matrix = df[num_cols].corr()
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
    plt.title('Correlation Matrix')
    plt.show()


In [None]:
# Boxplot of the first 'temp'-named column grouped by the 'mood' column.
# Mood is matched by exact name, temperature by substring (both case-insensitive).
mood_col = [name for name in df.columns if name.lower() == 'mood']
temp_col = [name for name in df.columns if 'temp' in name.lower()]
if not (mood_col and temp_col):
    print('Mood-temp plot skipped')
else:
    plt.figure(figsize=(8, 4))
    sns.boxplot(data=df, x=mood_col[0], y=temp_col[0])
    plt.title('Temperature by Mood')
    plt.show()


In [None]:
# Most common values in up to three recommendation-like columns, i.e. columns
# whose name contains 'recommend', 'top' or 'footwear' (case-insensitive)
rec_keywords = ('recommend', 'top', 'footwear')
rec_cols = [c for c in df.columns if any(k in c.lower() for k in rec_keywords)]
if not rec_cols:
    print('No recommendation-like columns')
else:
    for name in rec_cols[:3]:
        print('\nTop values for', name)
        display(df[name].value_counts().head(10))
