In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report, confusion_matrix

# Set style
# Global plotting defaults for every figure produced below.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

# Load datasets
# NOTE(review): absolute paths under /content/sample_data — presumably a
# Colab-style environment; adjust when running elsewhere.
print("Loading datasets...")
iris_df = pd.read_csv("/content/sample_data/Iris.csv")
customer_df = pd.read_csv("/content/sample_data/Customer Purchasing Behaviors.csv")

print(f"Customer data: {customer_df.shape}")
print(f"Iris data: {iris_df.shape}")

# Customer data preprocessing
# Work on a copy so the raw frame stays untouched; the identifier column is
# dropped so it is not used as a model feature.
print("\nProcessing customer data...")
customer_df_clean = customer_df.copy()
customer_df_clean.drop(columns=['user_id'], inplace=True)

# Viz 1: Distribution of loyalty scores
plt.figure(figsize=(10, 6))
plt.hist(customer_df_clean['loyalty_score'], bins=20, edgecolor='black', alpha=0.7)
plt.xlabel('Loyalty Score')
plt.ylabel('Frequency')
plt.title('Distribution of Customer Loyalty Scores')
plt.savefig('loyalty_distribution.png', dpi=300, bbox_inches='tight')
plt.close()  # release the figure so open figures do not accumulate
print("Saved: loyalty_distribution.png")

# Viz 2: Feature correlations
plt.figure(figsize=(10, 8))
numeric_cols = ['age', 'annual_income', 'purchase_amount', 'loyalty_score', 'purchase_frequency']
correlation = customer_df_clean[numeric_cols].corr()
sns.heatmap(correlation, annot=True, cmap='summer_r', center=0,
            square=True, linewidths=1, fmt='.2f')
plt.title('Feature Correlation Matrix - Customer Data')
plt.tight_layout()
plt.savefig('customer_correlation.png', dpi=300, bbox_inches='tight')
plt.close()
print("Saved: customer_correlation.png")

# Prepare data for modeling
# One-hot encode `region` (first level dropped to avoid a redundant column).
# dtype=float keeps the dummies numeric on every pandas version, so the
# mean/std arithmetic below never operates on boolean columns.
customer_df_clean = pd.get_dummies(
    customer_df_clean, columns=['region'], drop_first=True, dtype=float
)
X_cust = customer_df_clean.drop(columns=['loyalty_score'])
y_cust = customer_df_clean['loyalty_score']

X_train_cust, X_test_cust, y_train_cust, y_test_cust = train_test_split(
    X_cust, y_cust, test_size=0.2, random_state=42
)

# Normalize for custom model
# Z-score with statistics taken from the training split only, then applied
# unchanged to the test split.
X_train_mean = X_train_cust.mean()
# A column that is constant in the training split has std 0; dividing by it
# would yield NaN/inf features. Substituting 1 leaves such a column at 0.
X_train_std = X_train_cust.std().replace(0, 1)
X_train_cust_norm = (X_train_cust - X_train_mean) / X_train_std
X_test_cust_norm = (X_test_cust - X_train_mean) / X_train_std

# Train sklearn model
# The reference model is fitted on the un-normalised features.
print("Training sklearn linear regression...")
lin_reg_sklearn = LinearRegression()
lin_reg_sklearn.fit(X_train_cust, y_train_cust)
y_pred_sklearn = lin_reg_sklearn.predict(X_test_cust)
rmse_sklearn = np.sqrt(mean_squared_error(y_test_cust, y_pred_sklearn))
r2_sklearn = lin_reg_sklearn.score(X_test_cust, y_test_cust)

# Train custom model
print("Training custom linear regression...")

class SimpleLinearRegression:
    """Linear regression fitted with full-batch gradient descent on the MSE.

    Attributes:
        lr: Step size applied to every gradient update.
        epochs: Number of full passes over the training data.
        weights: 1-D array with one coefficient per feature (``None`` until
            ``fit`` has been called).
        bias: Scalar intercept (``None`` until ``fit`` has been called).
    """

    def __init__(self, lr=0.01, epochs=1000):
        self.lr = lr
        self.epochs = epochs
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Estimate ``weights`` and ``bias`` from the training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: Array-like with n_samples targets; a column vector of shape
                (n_samples, 1) is accepted and flattened.

        Returns:
            The fitted estimator itself, so calls can be chained.
        """
        X = np.asarray(X, dtype=float)
        # Flatten so a (n, 1) target cannot broadcast against the (n,)
        # prediction vector into an (n, n) residual matrix.
        y = np.asarray(y, dtype=float).ravel()

        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0.0

        for _ in range(self.epochs):
            residuals = (np.dot(X, self.weights) + self.bias) - y
            # Gradients of the (halved) mean squared error.
            dw = (1 / n_samples) * np.dot(X.T, residuals)
            db = (1 / n_samples) * np.sum(residuals)

            self.weights -= self.lr * dw
            self.bias -= self.lr * db

        return self

    def predict(self, X):
        """Return the predicted target for every row of ``X``."""
        return np.dot(np.asarray(X, dtype=float), self.weights) + self.bias

# Fit the gradient-descent model on the z-scored features and score it on the
# identically scaled test split.
lin_reg_custom = SimpleLinearRegression(lr=0.01, epochs=5000)
lin_reg_custom.fit(X_train_cust_norm.values.astype(float), y_train_cust.values.astype(float))
y_pred_custom = lin_reg_custom.predict(X_test_cust_norm.values.astype(float))
rmse_custom = np.sqrt(mean_squared_error(y_test_cust, y_pred_custom))

print(f"Sklearn RMSE: {rmse_sklearn:.4f}")
print(f"Custom RMSE: {rmse_custom:.4f}")

# Viz 3: Predictions vs Actual (sklearn)
plt.figure(figsize=(10, 6))
plt.scatter(y_test_cust, y_pred_sklearn, alpha=0.6, edgecolors='k')
# Diagonal y = x: points on this line are predicted exactly.
plt.plot([y_test_cust.min(), y_test_cust.max()],
         [y_test_cust.min(), y_test_cust.max()],
         'r--', lw=2, label='Perfect Prediction')
plt.xlabel('Actual Loyalty Score')
plt.ylabel('Predicted Loyalty Score')
plt.title(f'Linear Regression: Predictions vs Actual (Scikit-learn)\nRMSE: {rmse_sklearn:.3f}, R²: {r2_sklearn:.3f}')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig('linreg_predictions_sklearn.png', dpi=300, bbox_inches='tight')
plt.close()
print("Saved: linreg_predictions_sklearn.png")

# Viz 4: Model Comparison
plt.figure(figsize=(10, 6))
models = ['Scikit-learn', 'Custom Implementation']
rmse_values = [rmse_sklearn, rmse_custom]
colors = ['#2ecc71', '#3498db']

bars = plt.bar(models, rmse_values, color=colors, alpha=0.7, edgecolor='black')
plt.ylabel('RMSE (Lower is Better)')
plt.title('Linear Regression: Model Comparison')
# 20 % headroom above the tallest bar leaves room for the value labels.
plt.ylim(0, max(rmse_values) * 1.2)

# Write each RMSE just above its bar.
for bar, val in zip(bars, rmse_values):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f'{val:.3f}', ha='center', va='bottom', fontweight='bold')

plt.savefig('linreg_comparison.png', dpi=300, bbox_inches='tight')
plt.close()
print("Saved: linreg_comparison.png")

# Iris data preprocessing
# Work on a copy and drop the row identifier so it is not used as a feature.
print("\nProcessing iris data...")
iris_df_clean = iris_df.copy()
iris_df_clean.drop(columns=['Id'], inplace=True)

# Viz 5: Petal scatter
# Must run before the species column is label-encoded below, because the
# rows are selected by species name.
plt.figure(figsize=(10, 6))
species_names = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
for species in species_names:
    data = iris_df_clean[iris_df_clean['Species'] == species]
    plt.scatter(data['PetalLengthCm'], data['PetalWidthCm'],
                label=species, alpha=0.7, s=50)
plt.xlabel('Petal Length (cm)')
plt.ylabel('Petal Width (cm)')
plt.title('Iris Dataset: Petal Dimensions by Species')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig('iris_scatter.png', dpi=300, bbox_inches='tight')
plt.close()
print("Saved: iris_scatter.png")

# Encode species
# Replace the species names with integer class ids.
label_encoder = LabelEncoder()
iris_df_clean['Species'] = label_encoder.fit_transform(iris_df_clean['Species'])

X_iris = iris_df_clean.drop(columns=['Species'])
y_iris = iris_df_clean['Species']

X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    X_iris, y_iris, test_size=0.2, random_state=42
)

# Train sklearn model
print("Training sklearn logistic regression...")
log_reg_sklearn = LogisticRegression(max_iter=200, random_state=42)
log_reg_sklearn.fit(X_train_iris, y_train_iris)
y_pred_sklearn_log = log_reg_sklearn.predict(X_test_iris)
acc_sklearn = accuracy_score(y_test_iris, y_pred_sklearn_log)

# Train custom model
print("Training custom logistic regression...")

class SimpleLogisticRegression:
    """One-vs-rest logistic regression trained with batch gradient descent.

    One independent binary classifier is fitted per class; prediction picks
    the class whose classifier reports the highest probability.

    Attributes:
        lr: Step size applied to every gradient update.
        epochs: Number of gradient steps per binary classifier.
        weights: Dict mapping each class label to its 1-D coefficient array.
        bias: Dict mapping each class label to its scalar intercept.
        classes: Sorted array of class labels seen in ``fit`` (``None``
            before fitting).
    """

    def __init__(self, lr=0.01, epochs=1000):
        self.lr = lr
        self.epochs = epochs
        self.weights = {}
        self.bias = {}
        self.classes = None

    def sigmoid(self, z):
        """Logistic function; the input is clipped so ``exp`` cannot overflow."""
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        """Fit one binary classifier per distinct label in ``y``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: Array-like with n_samples labels; a column vector of shape
                (n_samples, 1) is accepted and flattened.

        Returns:
            The fitted estimator itself, so calls can be chained.
        """
        X = np.asarray(X, dtype=float)
        # Flatten so a (n, 1) label array cannot broadcast the residual into
        # an (n, n) matrix.
        y = np.asarray(y).ravel()

        n_samples, n_features = X.shape
        self.classes = np.unique(y)
        # Start from a clean slate so a refit with different classes does not
        # keep parameters of classes that no longer exist.
        self.weights = {}
        self.bias = {}

        for cls in self.classes:
            y_binary = (y == cls).astype(int)
            w = np.zeros(n_features)
            b = 0.0

            for _ in range(self.epochs):
                error = self.sigmoid(np.dot(X, w) + b) - y_binary
                # Gradients of the mean binary cross-entropy.
                dw = (1 / n_samples) * np.dot(X.T, error)
                db = (1 / n_samples) * np.sum(error)
                w -= self.lr * dw
                b -= self.lr * db

            self.weights[cls] = w
            self.bias[cls] = b

        return self

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        A single 1-D sample is treated as one row. When several classes tie,
        the one that comes first in ``classes`` wins.
        """
        X = np.asarray(X, dtype=float)
        if X.size == 0:
            # Nothing to score; return an empty array of the label dtype.
            return self.classes[:0]
        X = np.atleast_2d(X)

        # Column j holds the probability reported by the classifier of
        # classes[j] for every sample.
        scores = np.column_stack([
            self.sigmoid(np.dot(X, self.weights[cls]) + self.bias[cls])
            for cls in self.classes
        ])
        # argmax returns the first maximum, so ties resolve to the earliest class.
        return self.classes[np.argmax(scores, axis=1)]

# Fit the one-vs-rest gradient-descent classifier on the raw iris features
# and score it on the held-out split.
log_reg_custom = SimpleLogisticRegression(lr=0.1, epochs=1000)
log_reg_custom.fit(X_train_iris.values.astype(float), y_train_iris.values.astype(int))
y_pred_custom_log = log_reg_custom.predict(X_test_iris.values.astype(float))
acc_custom = accuracy_score(y_test_iris, y_pred_custom_log)

print(f"Sklearn Accuracy: {acc_sklearn:.4f}")
print(f"Custom Accuracy: {acc_custom:.4f}")

# Viz 6: Confusion Matrix
plt.figure(figsize=(8, 6))
# Take the tick labels from the fitted encoder so they always match the
# integer class ids, and pass the full id list so the matrix keeps one
# row/column per class even if a class is missing from the test split.
class_names = list(label_encoder.classes_)
class_ids = list(range(len(class_names)))
cm = confusion_matrix(y_test_iris, y_pred_sklearn_log, labels=class_ids)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', square=True,
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title(f'Logistic Regression: Confusion Matrix (Scikit-learn)\nAccuracy: {acc_sklearn:.3f}')
plt.savefig('logreg_confusion_sklearn.png', dpi=300, bbox_inches='tight')
plt.close()
print("Saved: logreg_confusion_sklearn.png")

# Viz 7: Model Comparison
plt.figure(figsize=(10, 6))
models = ['Scikit-learn', 'Custom Implementation']
acc_values = [acc_sklearn * 100, acc_custom * 100]
colors = ['#e74c3c', '#9b59b6']

bars = plt.bar(models, acc_values, color=colors, alpha=0.7, edgecolor='black')
plt.ylabel('Accuracy (%)')
plt.title('Logistic Regression: Model Comparison')
# Headroom above 100 so the label of a 100 % bar stays inside the axes
# instead of being drawn over the title.
plt.ylim(0, 110)

# Write each accuracy just above its bar.
for bar, val in zip(bars, acc_values):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
             f'{val:.1f}%', ha='center', va='bottom', fontweight='bold')

plt.savefig('logreg_comparison.png', dpi=300, bbox_inches='tight')
plt.close()
print("Saved: logreg_comparison.png")

# Final console report: metrics of both model pairs and the written files.
print("\n" + "="*50)
print("SUMMARY")
print("="*50)
print("\nLinear Regression:")
print(f"  Scikit-learn RMSE: {rmse_sklearn:.4f}, R²: {r2_sklearn:.4f}")
print(f"  Custom RMSE: {rmse_custom:.4f}")
print("\nLogistic Regression:")
print(f"  Scikit-learn Accuracy: {acc_sklearn:.4f} ({acc_sklearn*100:.2f}%)")
print(f"  Custom Accuracy: {acc_custom:.4f} ({acc_custom*100:.2f}%)")
print("\n" + "="*50)
print("All visualizations saved!")
print("="*50)
print("\nGenerated 7 files:")
print("  1. loyalty_distribution.png")
print("  2. customer_correlation.png")
print("  3. linreg_predictions_sklearn.png")
print("  4. linreg_comparison.png")
print("  5. iris_scatter.png")
print("  6. logreg_confusion_sklearn.png")
print("  7. logreg_comparison.png")

Loading datasets...
Customer data: (238, 7)
Iris data: (150, 6)

Processing customer data...
Saved: loyalty_distribution.png
Saved: customer_correlation.png
Training sklearn linear regression...
Training custom linear regression...
Sklearn RMSE: 0.1860
Custom RMSE: 0.2011
Saved: linreg_predictions_sklearn.png
Saved: linreg_comparison.png

Processing iris data...
Saved: iris_scatter.png
Training sklearn logistic regression...
Training custom logistic regression...
Sklearn Accuracy: 1.0000
Custom Accuracy: 1.0000
Saved: logreg_confusion_sklearn.png
Saved: logreg_comparison.png

SUMMARY

Linear Regression:
  Scikit-learn RMSE: 0.1860, R²: 0.9916
  Custom RMSE: 0.2011

Logistic Regression:
  Scikit-learn Accuracy: 1.0000 (100.00%)
  Custom Accuracy: 1.0000 (100.00%)

All visualizations saved!

Generated 7 files:
  1. loyalty_distribution.png
  2. customer_correlation.png
  3. linreg_predictions_sklearn.png
  4. linreg_comparison.png
  5. iris_scatter.png
  6. logreg_confusion_sklearn.png
 