In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import plot_tree
import warnings
warnings.filterwarnings('ignore')

# Read the raw CPI table; the path is resolved relative to the working directory.
df = pd.read_csv('Downloads/denver_cpi.csv')

# Structural overview of the frame: its dimensions first, then one
# heading/summary pair each for the column dtypes, a preview of the first
# rows, and the per-column count of missing values.
print("Dataset shape:", df.shape)
for heading, summary in (
    ("\nColumn dtypes:", df.dtypes),
    ("\nFirst few rows:", df.head()),
    ("\nMissing values per column:", df.isna().sum()),
):
    print(heading)
    print(summary)

# Data preparation
# Rows without a percentChangeYear value cannot be labelled, so drop them
# before deriving the target. The explicit .copy() makes the result an
# independent frame, so the column assignment below does not write into a
# filtered slice of the original (which pandas reports as
# SettingWithCopyWarning).
df = df.dropna(subset=['percentChangeYear']).copy()

# Binary target for modeling: 1 when percentChangeYear is positive, else 0.
df['price_increased'] = (df['percentChangeYear'] > 0).astype(int)

# Display the class distribution (counts per class and share of positives)
print("\nClass distribution of target variable:")
print(df['price_increased'].value_counts())
print(f"Percentage of price increases: {df['price_increased'].mean() * 100:.2f}%")

# Candidate predictors, grouped by how they are preprocessed. A name is kept
# only if the loaded frame actually has that column. percentChangeYear is not
# listed because the target is derived directly from it.
available_columns = set(df.columns)
numeric_features = [name for name in ('periodYear', 'cpi') if name in available_columns]
categorical_features = [
    name
    for name in ('areaType', 'periodType', 'type', 'dataRegion')
    if name in available_columns
]

# Sanity check: show which columns survived the availability filter
print("\nUsing numeric features:", numeric_features)
print("Using categorical features:", categorical_features)

# Feature matrix (numeric columns first, then categorical) and target vector
feature_columns = numeric_features + categorical_features
X = df[feature_columns]
y = df['price_increased']

# 80/20 split, stratified on the target so both parts keep the class ratio;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Preprocessing: standardize numeric columns, one-hot encode categorical ones.
# handle_unknown='ignore' lets the encoder accept categories at predict time
# that were absent when it was fitted.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Exploratory Data Analysis
print("\nSummary statistics for numerical features:")
print(df[numeric_features].describe())

# Class balance of the target, written to target_distribution.png
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='price_increased', data=df, ax=ax)
ax.set_title('Distribution of Price Increases')
ax.set_xlabel('Price Increased (1=Yes, 0=No)')
ax.set_ylabel('Count')
fig.savefig('target_distribution.png')
plt.close(fig)

# CPI against periodYear, one line per areaType, written to cpi_trend.png
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x='periodYear', y='cpi', data=df, hue='areaType', ax=ax)
ax.set_title('CPI Trend Over Years by Area Type')
fig.savefig('cpi_trend.png')
plt.close(fig)

# Correlation analysis
# Select every numeric dtype rather than only float64/int64: the target is
# created with .astype(int), whose width is platform dependent (it can be
# int32). With the narrower filter the target was excluded, this whole section
# was silently skipped, and an empty 10x8 figure was left open.
numerical_df = df.select_dtypes(include='number')
if 'price_increased' in numerical_df.columns:
    correlation = numerical_df.corr()

    # Create the figure only once there is something to draw on it, so no
    # blank figure can leak when the guard above is false.
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Matrix of Numerical Features')
    plt.savefig('correlation_matrix.png')
    plt.close()

    # Rank the numeric columns by their correlation with the target
    print("\nCorrelation with target variable:")
    print(correlation['price_increased'].sort_values(ascending=False))

# Model training
# Both classifiers go through the same tune -> evaluate -> plot routine, so the
# shared steps live in one helper instead of two copy-pasted sections.

def tune_and_evaluate(label, pipeline, param_grid, cm_path,
                      X_train, y_train, X_test, y_test):
    """Grid-search a pipeline, report its test metrics and save a confusion matrix.

    Parameters
    ----------
    label : str
        Human-readable model name used in printed headings and the plot title.
    pipeline : sklearn.pipeline.Pipeline
        Unfitted pipeline to tune; hyperparameter keys in ``param_grid`` must
        address its steps (e.g. ``classifier__max_depth``).
    param_grid : dict
        Hyperparameter grid passed to ``GridSearchCV``.
    cm_path : str
        File path the confusion-matrix heatmap is saved to.
    X_train, y_train
        Training data used for the 5-fold cross-validated search.
    X_test, y_test
        Held-out data used for the final accuracy, report and confusion matrix.

    Returns
    -------
    tuple
        ``(grid_search, best_estimator, y_pred, cm)``: the fitted search
        object, its refitted best pipeline, that pipeline's predictions on
        ``X_test`` and the resulting confusion matrix.
    """
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X_train, y_train)

    # Best hyperparameters
    print(f"\nBest hyperparameters for {label}:")
    print(grid_search.best_params_)

    # Evaluate on the held-out test set
    best_estimator = grid_search.best_estimator_
    y_pred = best_estimator.predict(X_test)

    print(f"\n{label} Test Accuracy:", accuracy_score(y_test, y_pred))
    print(f"\n{label} Classification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion matrix heatmap (rows = actual class, columns = predicted class)
    cm = confusion_matrix(y_test, y_pred)
    class_labels = ['No Increase', 'Increase']
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels,
                yticklabels=class_labels)
    plt.title(f'{label} Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.savefig(cm_path)
    plt.close()

    return grid_search, best_estimator, y_pred, cm


# 1. Decision Tree Model with Hyperparameter Tuning
print("\n--- Decision Tree Classifier ---")

# Create pipeline with preprocessing and model
dt_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

# Define hyperparameter grid
dt_param_grid = {
    'classifier__max_depth': [None, 5, 10, 15, 20],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

dt_grid_search, dt_best, dt_y_pred, dt_cm = tune_and_evaluate(
    'Decision Tree',
    dt_pipeline,
    dt_param_grid,
    'dt_confusion_matrix.png',
    X_train, y_train, X_test, y_test,
)

# 2. Logistic Regression with Hyperparameter Tuning
print("\n--- Logistic Regression ---")

# Create pipeline with preprocessing and model
lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

# Define hyperparameter grid
lr_param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l2'],
    'classifier__solver': ['liblinear', 'saga']
}

lr_grid_search, lr_best, lr_y_pred, lr_cm = tune_and_evaluate(
    'Logistic Regression',
    lr_pipeline,
    lr_param_grid,
    'lr_confusion_matrix.png',
    X_train, y_train, X_test, y_test,
)

# Side-by-side test accuracy of the two tuned models, saved to
# model_comparison.png. The mapping keeps each label next to its predictions.
test_predictions = {
    'Decision Tree': dt_y_pred,
    'Logistic Regression': lr_y_pred,
}
models = list(test_predictions)
accuracies = [accuracy_score(y_test, y_hat) for y_hat in test_predictions.values()]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=models, y=accuracies, ax=ax)
ax.set_title('Model Comparison - Test Accuracy')
ax.set_ylim(0, 1)  # accuracy is a fraction, so show the full 0-1 range
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.savefig('model_comparison.png')
plt.close(fig)

# Feature importance for Decision Tree
dt_classifier = dt_best.named_steps['classifier']
if hasattr(dt_classifier, 'feature_importances_'):
    feature_importances = dt_classifier.feature_importances_

    # Read the feature names from the preprocessor fitted inside the best
    # pipeline. The module-level `preprocessor` object is never fitted itself:
    # GridSearchCV fits clones of the pipeline it is given, so fitted
    # attributes only exist on the steps of `dt_best`.
    fitted_preprocessor = dt_best.named_steps['preprocessor']
    try:
        all_features = list(fitted_preprocessor.get_feature_names_out())
    except (AttributeError, ValueError) as e:
        # get_feature_names_out is unavailable on older scikit-learn releases;
        # keep the analysis running with positional names instead.
        print(f"\nCould not recover feature names ({e}); using positional names.")
        all_features = []

    # Guard against any name/importance length mismatch before building the frame
    if len(all_features) != len(feature_importances):
        all_features = [f"Feature_{i}" for i in range(len(feature_importances))]

    importance_df = pd.DataFrame({
        'Feature': all_features,
        'Importance': feature_importances
    }).sort_values('Importance', ascending=False)

    # Horizontal bar chart of the 15 largest importances
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=importance_df.head(15))
    plt.title('Top 15 Feature Importances (Decision Tree)')
    plt.tight_layout()
    plt.savefig('dt_feature_importance.png')
    plt.close()

    print("\nTop 10 most important features:")
    print(importance_df.head(10))

print("\nAnalysis complete! Models have been trained and evaluated.")

Dataset shape: (4349, 17)

Column dtypes:
stateFips                  int64
area                       int64
areaType                   int64
period                     int64
periodYear                 int64
periodType                 int64
periodTypeDescription     object
cpi                      float64
title                     object
type                       int64
source                     int64
cpiSourceDescription      object
percentChangeYear        float64
percentChangeMonth       float64
dataRegion                object
areaName                  object
areaDescription           object
dtype: object

First few rows:
   stateFips  area  areaType  period  periodYear  periodType  \
0          0     0         0       0        1913           1   
1          0     0         0       0        1913           1   
2          0     0         0       1        1913           3   
3          0     0         0       1        1913           3   
4          0     0         0       2        19

<Figure size 1000x800 with 0 Axes>