# DS 4400 Homework 1 - Code
## Ariv Ahuja
### January 2026

In [None]:
import pandas as pd
import numpy as np

## Problem 4.2: Random Matrices and Inverses

In [None]:
# Fix the RNG seed so the same three matrices are generated on every run
np.random.seed(42)

banner = "=" * 80
print(banner)
print("PROBLEM 4.2: Random Matrices and Inverses")
print(banner)

for i in range(3):
    # randint's upper bound is exclusive, so high=11 yields entries in [-10, 10]
    M = np.random.randint(low=-10, high=11, size=(3, 3))
    print(f"\nMatrix {i+1}:")
    print(M)

    det = np.linalg.det(M)
    print(f"\nDeterminant: {det:.4f}")

    # The determinant is computed in floating point, so a singular matrix is
    # detected with a small tolerance instead of an exact comparison to zero.
    if not abs(det) > 1e-10:
        print("\nMatrix is singular, no inverse exists.")
    else:
        M_inv = np.linalg.inv(M)
        print("\nInverse:")
        print(M_inv)
        # Rounding hides floating-point noise so the product reads as the identity
        print("\nM @ M^(-1) (should be identity):")
        print(np.round(np.matmul(M, M_inv), decimals=10))

    print("\n" + "-" * 40)

## Problem 5: Average, Variance, and Correlation

In [None]:
# Read the housing CSV into a DataFrame, then report its shape and column names
df = pd.read_csv('kc_house_data.csv')
column_names = df.columns.tolist()
print(f"Dataset shape: {df.shape}")
print(f"\nColumn names: {column_names}")

In [None]:
# Drop the id, date, and zipcode columns as instructed; df itself is left unchanged
cols_to_remove = ['id', 'date', 'zipcode']
df_analysis = df.drop(labels=cols_to_remove, axis='columns')

# Every remaining column except the response (price) is treated as a feature.
# Iterating a DataFrame yields its column labels in order.
features = [name for name in df_analysis if name != 'price']
print(f"Features to analyze: {features}")

### Part 1: Feature Statistics

In [None]:
# Summarize every feature by its mean, minimum, maximum, and variance
stats = {}
for col in features:
    column = df_analysis[col]
    stats[col] = dict(
        average=column.mean(),
        min=column.min(),
        max=column.max(),
        variance=column.var(),
    )

# Transposing gives one row per feature; the four columns keep the insertion
# order of the inner dicts, so they can be relabelled positionally.
stats_df = pd.DataFrame(stats).transpose()
stats_df.columns = ['Average', 'Min', 'Max', 'Variance']
print("Feature Statistics:")
print(stats_df.to_string())

In [None]:
# Rank the features by mean (ascending) and report both ends of the ranking
avg_sorted = sorted(stats.items(), key=lambda entry: entry[1]['average'])
(lowest_avg_name, lowest_avg_stats), (highest_avg_name, highest_avg_stats) = avg_sorted[0], avg_sorted[-1]
print(f"\nFeature with LOWEST average: {lowest_avg_name} = {lowest_avg_stats['average']:.4f}")
print(f"Feature with HIGHEST average: {highest_avg_name} = {highest_avg_stats['average']:.4f}")

In [None]:
# Rank the features by variance (ascending) and report both ends of the ranking
var_sorted = sorted(stats.items(), key=lambda entry: entry[1]['variance'])
(lowest_var_name, lowest_var_stats), (highest_var_name, highest_var_stats) = var_sorted[0], var_sorted[-1]
print(f"\nFeature with LOWEST variance: {lowest_var_name} = {lowest_var_stats['variance']:.4f}")
print(f"Feature with HIGHEST variance: {highest_var_name} = {highest_var_stats['variance']:.4f}")

### Part 2: Correlation with Response (price)

In [None]:
# Correlate every feature with the response column (price)
price_series = df_analysis['price']
correlations = {name: df_analysis[name].corr(price_series) for name in features}

# Tabulate the coefficients, strongest positive correlation first
corr_df = pd.DataFrame(
    list(correlations.items()),
    columns=['Feature', 'Correlation with Price'],
).sort_values(by='Correlation with Price', ascending=False)
print("Correlation Coefficients with Price:")
print(corr_df.to_string(index=False))

In [None]:
# Keep only the features whose correlation with price is strictly positive
pos_corr = {name: value for name, value in correlations.items() if value > 0}
print(f"\nPositively correlated features ({len(pos_corr)} total):")
for feature in pos_corr:
    print(f"  - {feature}")

In [None]:
# Find the feature with the highest positive correlation.
# Only strictly positive coefficients are candidates. This keeps the printed
# label truthful and excludes NaN coefficients (e.g. from a constant column),
# which would otherwise make max() depend on dict order because every
# comparison against NaN is False.
positive_correlations = [(name, value) for name, value in correlations.items() if value > 0]
if positive_correlations:
    # max_corr_feature is a (feature name, coefficient) pair
    max_corr_feature = max(positive_correlations, key=lambda item: item[1])
    print(f"\nFeature with HIGHEST positive correlation: {max_corr_feature[0]} = {max_corr_feature[1]:.6f}")
else:
    max_corr_feature = None
    print("\nNo feature is positively correlated with price.")

### Part 3: Negative Correlations

In [None]:
# Find negatively correlated features (coefficient strictly below zero)
neg_corr = {k: v for k, v in correlations.items() if v < 0}

if neg_corr:
    print(f"Features with NEGATIVE correlation ({len(neg_corr)} total):")
    for feature, corr in neg_corr.items():
        print(f"  - {feature}: {corr:.6f}")
else:
    print("No features have a negative correlation with price.")
    # Derive the counts from the data instead of hard-coding the feature count.
    # "Not negative" does not imply "positive": a coefficient can be exactly
    # zero or NaN, so only claim "all" when every coefficient is > 0.
    n_total = len(correlations)
    n_positive = sum(1 for v in correlations.values() if v > 0)
    if n_positive == n_total:
        print(f"All {n_total} features are positively correlated with the response variable.")
    else:
        print(f"{n_positive} of {n_total} features are positively correlated with the response variable.")