<a href="https://colab.research.google.com/github/FuzzilyDeveloper/ML_Project/blob/master/EDA_S5e2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths used by later cells
# live under this mount point.
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, f_oneway
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Locations of the competition CSVs on the mounted Drive
train_path = '/content/drive/MyDrive/playground-series-s5e2/train.csv'
test_path = '/content/drive/MyDrive/playground-series-s5e2/test.csv'

# Read both splits (train first, then test)
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Feature groups referenced by the analysis cells below
numerical_cols = ['Compartments', 'Weight Capacity (kg)']
categorical_cols = [
    'Brand', 'Material', 'Size', 'Laptop Compartment',
    'Waterproof', 'Style', 'Color',
]

In [None]:
# prompt: analyse the df_train and predict the price in df_test by using a gemini pretrained model

# Assuming necessary libraries are already installed and imported in the preceding code.
# If not, uncomment and run the following lines:

!pip install transformers
from transformers import pipeline

# Initialize the Gemini model pipeline for text classification
# Replace 'google/flan-t5-xl' with the appropriate Gemini model identifier if available.
# As of now, a readily accessible Gemini model via Hugging Face Transformers is not public.
# Therefore, using a similar large language model.
# Please check HuggingFace model hub for any Gemini models that may be available in the future.
classifier = pipeline("text-classification", model="google/flan-t5-xl")

# Create dummy text features from other features (replace with actual meaningful feature engineering)
df_train['text_features'] = df_train['Brand'].astype(str) + " " + df_train['Material'].astype(str) + " " + df_train['Size'].astype(str)
df_test['text_features'] = df_test['Brand'].astype(str) + " " + df_test['Material'].astype(str) + " " + df_test['Size'].astype(str)


# Function to predict price based on text features
def predict_price(text):
    """Run the module-level ``classifier`` on ``text`` and return its top score.

    Parameters
    ----------
    text : str
        Text description to feed to the classifier.

    Returns
    -------
    float
        The ``'score'`` field of the first classifier result. If the
        classifier raises, the error is printed and NaN is returned, so a
        failed prediction is visible as missing data instead of looking like
        a genuine price of 0.
    """
    try:
        result = classifier(text)
        # The first result's score is used as the prediction.
        # Adjust if the model output has a different layout.
        return result[0]['score']
    except Exception as e:
        # Best-effort: report the failure and keep scoring the remaining rows.
        print(f"Error during prediction: {e}")
        return float('nan')

# Score every row of both splits with the text classifier (train first, then test)
for frame in (df_train, df_test):
    frame['predicted_price'] = frame['text_features'].apply(predict_price)

# Preview the first rows of each split's predictions
preview_cols = ['text_features', 'predicted_price']
for frame in (df_train, df_test):
    print(frame[preview_cols].head())


# Further analysis and model training (example using linear regression)
#from sklearn.linear_model import LinearRegression
#from sklearn.metrics import mean_squared_error

#model = LinearRegression()
#X = df_train['predicted_price'].values.reshape(-1, 1)
#y = df_train['Price']
#model.fit(X, y)

#df_test['Predicted_price_final'] = model.predict(df_test['predicted_price'].values.reshape(-1,1))

#print(df_test['Predicted_price_final'].head())



In [None]:
# Install required libraries
!pip install pandas matplotlib seaborn scipy

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew
import numpy as np

# Set plot style for better visualization
sns.set(style="whitegrid")

# Load training data
train_path = '/content/drive/MyDrive/playground-series-s5e2/train.csv'
df_train = pd.read_csv(train_path)

# Extract Price column
prices = df_train['Price']

# Calculate skewness
price_skewness = skew(prices)
print(f"Skewness of Price: {price_skewness:.3f}")

# Two side-by-side panels: distribution shape on the left, spread/outliers on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Left panel: histogram with a KDE overlay
sns.histplot(prices, bins=50, kde=True, color='skyblue', ax=ax1)
ax1.set(title='Price Distribution (Histogram + KDE)', xlabel='Price', ylabel='Frequency')

# Right panel: boxplot
sns.boxplot(x=prices, color='lightgreen', ax=ax2)
ax2.set(title='Price Distribution (Boxplot)', xlabel='Price')

# Tidy spacing, persist the figure to Drive, then render it
fig.tight_layout()
fig.savefig('/content/drive/MyDrive/playground-series-s5e2/price_distribution.png')
plt.show()

# Summary statistics for Price
print("\nPrice Summary Statistics:")
print(prices.describe())

# Categorize prices into budget, mid-range, premium (based on quartiles)
q25, q50, q75 = prices.quantile([0.25, 0.5, 0.75])
print("\nPrice Ranges (based on quartiles):")
print(f"Budget: <= {q25:.2f}")
print(f"Mid-range: {q25:.2f} to {q75:.2f}")
print(f"Premium: > {q75:.2f}")

# Count entries in each category.
# Series.sum() counts the True values in one vectorised pass; the builtin sum()
# would iterate over the Series element by element in Python.
budget_count = (prices <= q25).sum()
midrange_count = ((prices > q25) & (prices <= q75)).sum()
premium_count = (prices > q75).sum()

total_count = len(prices)
print("\nPrice Category Counts:")
print(f"Budget: {budget_count} ({budget_count/total_count*100:.1f}%)")
print(f"Mid-range: {midrange_count} ({midrange_count/total_count*100:.1f}%)")
print(f"Premium: {premium_count} ({premium_count/total_count*100:.1f}%)")

In [None]:


# 1. Basic Data Overview
print("Training Data Info:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df_train.info()
print("\nTraining Data Head:")
print(df_train.head())
print("\nSummary Statistics (Numerical Columns):")
print(df_train[numerical_cols].describe())
# Level frequencies for each categorical feature
print("\nValue Counts for Categorical Columns:")
for col in categorical_cols:
    print(f"\n{col}:\n{df_train[col].value_counts()}")

In [None]:


# 2. Outlier Detection
# One boxplot panel per numerical feature, plus the target
boxplot_cols = numerical_cols + ['Price']
fig, axes = plt.subplots(1, len(boxplot_cols), figsize=(12, 6), squeeze=False)
for ax, col in zip(axes.ravel(), boxplot_cols):
    sns.boxplot(y=df_train[col], ax=ax)
    ax.set_title(f"Boxplot of {col}")
fig.tight_layout()
plt.show()

In [None]:


def detect_outliers(df, column):
    """Return the entries of ``df[column]`` that fall outside the 1.5*IQR fences.

    The fences are Q1 - 1.5*IQR and Q3 + 1.5*IQR, computed from the column's
    25th and 75th percentiles. The result is a Series that keeps the original
    index labels of the flagged rows.
    """
    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1
    below_fence = values.lt(q1 - 1.5 * spread)
    above_fence = values.gt(q3 + 1.5 * spread)
    return values[below_fence | above_fence]

# Report the IQR-flagged values for each numerical feature and the target
for feature in [*numerical_cols, 'Price']:
    flagged = detect_outliers(df_train, feature)
    print(f"\nOutliers in {feature} (count: {len(flagged)}):\n{flagged}")

In [None]:


# 3. Correlation Analysis
# Pairwise correlations among the numerical features and the target
numerical_data = df_train[[*numerical_cols, 'Price']]
correlation_matrix = numerical_data.corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title("Correlation Matrix (Numerical Features)")
plt.show()

# Correlations with the target, strongest positive first
price_correlations = correlation_matrix['Price'].sort_values(ascending=False)
print("\nCorrelations with Price (Numerical Features):")
print(price_correlations)

In [None]:



# Categorical correlations using ANOVA
print("\nANOVA Test for Categorical Variables with Price:")
def anova_correlation(df, cat_col, target_col):
    """One-way ANOVA of ``target_col`` across the levels of ``cat_col``, plus eta-squared.

    Rows where either column is missing are dropped first, so the F-test, the
    grand mean and both sums of squares are all computed on the same set of
    rows. (groupby drops rows whose key is NaN by default; taking the grand
    mean and total sum of squares from the unfiltered frame would make
    eta-squared inconsistent with the groups that were actually tested.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    cat_col : str
        Name of the grouping (categorical) column.
    target_col : str
        Name of the numeric column being compared across groups.

    Returns
    -------
    tuple
        ``(f_stat, p_value, eta_squared)``. All three are NaN when fewer than
        two non-empty groups remain. ``eta_squared`` is 0 when the target has
        no variance.
    """
    valid = df[[cat_col, target_col]].dropna()
    # Empty groups can appear for unobserved levels of a categorical dtype; skip them.
    groups = [
        values.to_numpy()
        for _, values in valid.groupby(cat_col)[target_col]
        if len(values) > 0
    ]
    if len(groups) < 2:
        return np.nan, np.nan, np.nan

    f_stat, p_value = f_oneway(*groups)

    target = valid[target_col]
    grand_mean = target.mean()
    # Between-group sum of squares: group size times squared offset of the group mean.
    ss_between = sum(len(g) * (g.mean() - grand_mean) ** 2 for g in groups)
    # Vectorised total sum of squares, with no Python-level loop over rows.
    ss_total = ((target - grand_mean) ** 2).sum()
    eta_squared = ss_between / ss_total if ss_total > 0 else 0
    return f_stat, p_value, eta_squared

# Run the ANOVA for every categorical feature, reporting and collecting eta-squared
correlations = []
for cat_col in categorical_cols:
    f_stat, p_value, eta_squared = anova_correlation(df_train, cat_col, 'Price')
    print(
        f"\n{cat_col}:\n"
        f"  F-statistic: {f_stat:.4f}\n"
        f"  P-value: {p_value:.4f}\n"
        f"  Eta-squared: {eta_squared:.4f}"
    )
    correlations.append((cat_col, eta_squared))


def _eta_rank_key(entry):
    """Sort key for (column, eta) pairs: eta itself, with NaN treated as -1."""
    eta = entry[1]
    return -1 if np.isnan(eta) else eta


# Rank by eta-squared, largest effect first (NaN results sink to the bottom)
correlations = sorted(correlations, key=_eta_rank_key, reverse=True)
print("\nCategorical Variables Ranked by Eta-squared:")
for cat_col, eta in correlations:
    print(f"{cat_col}: {eta:.4f}")

In [None]:


# Mean Price by categorical variable: print the table, then draw it as a bar chart
for cat_col in categorical_cols:
    grouped = df_train.groupby(cat_col)['Price'].mean().sort_values()
    print(f"\nMean Price by {cat_col}:\n{grouped}")

    fig, ax = plt.subplots(figsize=(10, 4))
    grouped.plot(kind='bar', ax=ax)
    ax.set_title(f"Mean Price by {cat_col}")
    ax.set_ylabel("Price")
    plt.setp(ax.get_xticklabels(), rotation=45)
    plt.show()


In [None]:

# 4. Distribution Analysis
# Histogram + KDE panel for each numerical feature and the target
dist_cols = numerical_cols + ['Price']
fig, axes = plt.subplots(1, len(dist_cols), figsize=(12, 6), squeeze=False)
for ax, feature in zip(axes.ravel(), dist_cols):
    sns.histplot(df_train[feature], kde=True, ax=ax)
    ax.set_title(f"Distribution of {feature}")
fig.tight_layout()
plt.show()

# Skewness of each column, computed on its non-missing values
for feature in dist_cols:
    skewness = skew(df_train[feature].dropna())
    print(f"Skewness of {feature}: {skewness:.4f}")

In [None]:


# 5. Missing Values
# Per-column null counts for each split
for split_name, frame in (("Training", df_train), ("Test", df_test)):
    print(f"\nMissing Values in {split_name} Data:")
    print(frame.isnull().sum())

In [None]:


# 6. Feature Importance (PCA Loadings)
# Categorical branch: fill gaps with the mode, then one-hot encode (first level dropped)
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', sparse_output=False)),
])
# Numerical branch: fill gaps with the mean, then standardise
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_pipeline, categorical_cols),
        ('num', numerical_pipeline, numerical_cols),
    ]
)

features_only = df_train.drop(columns=['Price'])
X_train_processed = preprocessor.fit_transform(features_only)
feature_names = preprocessor.get_feature_names_out()

# Project onto 12 principal components and tabulate each feature's loading
pca = PCA(n_components=12)
X_train_reduced = pca.fit_transform(X_train_processed)
component_labels = [f"PC{k}" for k in range(1, 13)]
loadings = pd.DataFrame(pca.components_.T, index=feature_names, columns=component_labels)

print("\nPCA Loadings (Top 5 features per component):")
for component, weights in loadings.items():
    top_features = weights.abs().sort_values(ascending=False).head(5)
    print(f"\n{component}:\n{top_features}")

In [None]:


# 7. Test Data Consistency
# Overlay the train and test density of each numerical feature on a shared axis
for feature in numerical_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    for split_name, frame in (('Train', df_train), ('Test', df_test)):
        sns.kdeplot(frame[feature], label=split_name, ax=ax)
    ax.set_title(f"Distribution of {feature} (Train vs Test)")
    ax.legend()
    plt.show()

print("\nEDA Completed.")

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from scipy.stats import spearmanr
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Seed NumPy's global RNG for reproducibility
np.random.seed(8)

# Data locations
train_path = '/content/drive/MyDrive/playground-series-s5e2/train.csv'
test_path = '/content/drive/MyDrive/playground-series-s5e2/test.csv'

# Load both splits; rows with any missing value are removed from the training frame only
df_train = pd.read_csv(train_path).dropna()
df_test = pd.read_csv(test_path)

# Cap the working set at 5000 rows (seeded sample)
if len(df_train) > 5000:
    df_train = df_train.sample(n=5000, random_state=8)

# 80/20 hold-out split of the (possibly downsampled) training frame
df_train_split, df_val = train_test_split(df_train, test_size=0.2, random_state=10)

# Column roles
target = 'Price'
numerical_cols = ['Compartments', 'Weight Capacity (kg)']
categorical_cols = [
    'Brand', 'Material', 'Size', 'Laptop Compartment',
    'Waterproof', 'Style', 'Color',
]

# Fail fast if the target column is absent
if target not in df_train.columns:
    raise ValueError("Target column 'Price' not found in dataset.")

# 1. Inspect for Errors and Noise
print("=== Data Summary ===")
print(df_train.describe(include='all'))
print("\n=== Missing Values ===")
print(df_train.isnull().sum())

# Outliers: one boxplot per numerical column and the target, on a 1x3 grid
fig = plt.figure(figsize=(12, 6))
for position, feature in enumerate(numerical_cols + [target], start=1):
    ax = fig.add_subplot(1, 3, position)
    sns.boxplot(y=df_train[feature], ax=ax)
    ax.set_title(f'Boxplot of {feature}')
fig.tight_layout()
plt.show()

# 2. Analyze Feature Relevance
# One-hot encode the categoricals (first level dropped) next to the numeric columns and target
dummies = pd.get_dummies(df_train[categorical_cols], drop_first=True)
df_encoded = pd.concat([df_train[numerical_cols + [target]], dummies], axis=1)

# Spearman rank correlation of every encoded feature with the target
correlations = {
    feature: spearmanr(df_encoded[feature], df_encoded[target])[0]
    for feature in df_encoded.columns
    if feature != target
}
print("\n=== Spearman Correlations with Price ===")
ranked = sorted(correlations.items(), key=lambda item: abs(item[1]), reverse=True)
for feature, rho in ranked:
    print(f"{feature}: {rho:.3f}")

# Feature importance from a random forest fitted on the encoded frame
y = df_encoded[target]
X = df_encoded.drop(columns=[target])
rf = RandomForestRegressor(n_estimators=100, random_state=8).fit(X, y)
importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
print("\n=== Feature Importance (Random Forest) ===")
print(importances)

# 3. Examine Data Distribution
# Histogram + KDE for each numerical column and the target, on a 1x3 grid
hist_cols = numerical_cols + [target]
fig = plt.figure(figsize=(12, 6))
for position, feature in enumerate(hist_cols, start=1):
    ax = fig.add_subplot(1, 3, position)
    sns.histplot(df_train[feature], kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
fig.tight_layout()
plt.show()

# Skewness of the same columns
print("\n=== Skewness ===")
for feature in hist_cols:
    print(f"{feature}: {df_train[feature].skew():.3f}")

# 4. Assess Dataset Size and Diversity
print("\n=== Dataset Size ===")
print(f"Training set: {len(df_train)} rows")
print("Unique values in categorical columns:")
for feature in categorical_cols:
    print(f"{feature}: {df_train[feature].nunique()} unique values")

# 5. Check for Multicollinearity (numerical features)
from statsmodels.tools.tools import add_constant

# variance_inflation_factor regresses one column of the design matrix on all
# the others and does not add an intercept itself. Without an explicit constant
# column the auxiliary regressions are forced through the origin, which
# inflates the reported VIFs, so a constant is added here and the VIF is
# reported only for the real features.
X_num = df_train[numerical_cols]
X_vif = add_constant(X_num, has_constant='add')
vif_data = pd.DataFrame()
vif_data["Feature"] = numerical_cols
vif_data["VIF"] = [
    variance_inflation_factor(X_vif.values, X_vif.columns.get_loc(feature))
    for feature in numerical_cols
]
print("\n=== Variance Inflation Factor (VIF) ===")
print(vif_data)

# 6. Cross-Validation for Consistency
# Encode once on the full frame, before splitting. One-hot encoding each fold
# separately with drop_first=True drops whichever level sorts first *in that
# fold*; if a level is absent from one side, train and validation end up with
# different reference categories and the later column re-alignment silently
# zero-fills, mis-encoding validation rows. A single shared encoding gives
# every fold the same column layout.
X_all = pd.concat(
    [
        df_train[numerical_cols],
        pd.get_dummies(df_train[categorical_cols], drop_first=True),
    ],
    axis=1,
)
y_all = df_train[target]

kf = KFold(n_splits=5, shuffle=True, random_state=8)
mse_scores = []
for train_idx, val_idx in kf.split(X_all):
    X_train, y_train = X_all.iloc[train_idx], y_all.iloc[train_idx]
    X_val, y_val = X_all.iloc[val_idx], y_all.iloc[val_idx]

    # Fit on the training part of the fold and score on the held-out part
    model = RandomForestRegressor(n_estimators=100, random_state=8)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    mse = np.mean((y_val - y_pred) ** 2)
    mse_scores.append(mse)

print("\n=== Cross-Validation MSE ===")
print(f"MSE per fold: {mse_scores}")
print(f"Mean MSE: {np.mean(mse_scores):.3f}, Std: {np.std(mse_scores):.3f}")

# 7. Baseline Comparison
# MSE of always predicting the mean target value, measured on the same frame
observed_prices = df_train[target]
baseline_pred = observed_prices.mean()
baseline_mse = ((observed_prices - baseline_pred) ** 2).mean()
print("\n=== Baseline MSE (Mean Predictor) ===")
print(f"Baseline MSE: {baseline_mse:.3f}")

# 8. Visualize Feature-Target Relationships
# Numerical features vs. Price: scatter panels on a 1x2 grid
fig = plt.figure(figsize=(12, 4))
for position, feature in enumerate(numerical_cols, start=1):
    ax = fig.add_subplot(1, 2, position)
    sns.scatterplot(x=df_train[feature], y=df_train[target], ax=ax)
    ax.set_title(f"{feature} vs. Price")
fig.tight_layout()
plt.show()

# Categorical features vs. Price: boxplot panels placed on a 3x3 grid
fig = plt.figure(figsize=(15, 10))
for position, feature in enumerate(categorical_cols, start=1):
    ax = fig.add_subplot(3, 3, position)
    sns.boxplot(x=df_train[feature], y=df_train[target], ax=ax)
    ax.set_title(f"{feature} vs. Price")
    plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()