In [1]:
import pandas as pd
import numpy as np
#import seaborn as sns 
import seaborn as sns
import matplotlib.pyplot as plt

# Read the CSV data into a DataFrame. pandas raises EmptyDataError
# ("No columns to parse from file") when the file has no content; re-raise the
# same exception type with the path included so the failure is actionable.
DATA_PATH = "Data.csv"
try:
    data = pd.read_csv(DATA_PATH)
except pd.errors.EmptyDataError as exc:
    raise pd.errors.EmptyDataError(
        f"No columns to parse from file '{DATA_PATH}' - check that the file is not empty"
    ) from exc

# Print a summary of the data (column names, data types, non-null values, etc.)
data.info()

# Drop rows with missing values. Rebinding the name (instead of inplace=True)
# keeps the statement a plain transformation that is safe to re-run.
data = data.dropna()

# Print the summary again to see how missing values were handled
data.info()

# Import train_test_split from scikit-learn for splitting data into training and testing sets
from sklearn.model_selection import train_test_split

# Separate features (X) from the target variable (Y)
X = data.drop(["median_house_value"], axis=1)
Y = data["median_house_value"]

# Split data into training and testing sets (80% for training, 20% for testing).
# random_state pins the shuffle so the split, and every metric computed from it,
# is the same on each run of the notebook.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Rebuild a single training frame that holds the features and the target column
train_data = X_train.join(Y_train)

# One-hot encode the categorical "ocean_proximity" column as integer indicators
dummy_ocean_proximity = pd.get_dummies(train_data["ocean_proximity"]).astype(int)

# Attach the indicator columns, then remove the text column they replace
train_data = train_data.join(dummy_ocean_proximity).drop(columns=["ocean_proximity"])

# Visualize the distribution of features using histograms
train_data.hist(figsize=(15, 8))

# Compute the correlation matrix once and reuse it for the heatmap below
corr_matrix = train_data.corr()

# Heatmap of the correlation matrix; fmt=".2f" prints two decimal places per cell
plt.figure(figsize=(15, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="RdBu_r")
plt.show()

# Replace each skewed count column with log(x + 1); the +1 keeps the
# logarithm defined when a value is 0
for skewed_col in ('total_rooms', 'total_bedrooms', 'population', 'households'):
    train_data[skewed_col] = np.log(train_data[skewed_col] + 1)

# Visualize the distribution of features after transformation (histograms)
train_data.hist(figsize=(15, 8))

# Create another heatmap to see how correlations changed after transformation
plt.figure(figsize=(15, 8))
sns.heatmap(train_data.corr(), annot=True, cmap="RdBu_r")
plt.show()

# Scatterplot of location coloured by median house value.
# It is given its own figure: seaborn draws on the current axes, so without a
# new figure the points would be painted on top of the heatmap above.
# Longitude is on x and latitude on y so the points are laid out like a map.
plt.figure(figsize=(15, 8))
sns.scatterplot(x='longitude', y='latitude', data=train_data, hue='median_house_value', palette='coolwarm')
plt.show()

# Feature engineering: derive two ratio columns from existing columns
#   bedroom_ratio   = total_bedrooms / total_rooms
#   household_rooms = total_rooms / households
train_data['bedroom_ratio'] = train_data['total_bedrooms'].div(train_data['total_rooms'])
train_data['household_rooms'] = train_data['total_rooms'].div(train_data['households'])

# Correlation heatmap that now includes the two engineered columns
plt.figure(figsize=(15, 8))
sns.heatmap(data=train_data.corr(), annot=True, cmap="RdBu_r")

# Import LinearRegression from scikit-learn for linear regression modeling,
# plus the metrics used to score the predictions
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


def _prepare_test_features(raw_features, expected_columns):
    """Apply the training-time feature pipeline to held-out features.

    The model is fitted on one-hot encoded, log-transformed and ratio-augmented
    columns, so raw test rows must go through the same steps before predict().

    Parameters
    ----------
    raw_features : pandas.DataFrame
        Untransformed features that still contain the "ocean_proximity" column.
    expected_columns : pandas.Index or list of str
        Column names, in order, that the model was fitted on.

    Returns
    -------
    pandas.DataFrame
        A transformed copy of ``raw_features`` with exactly ``expected_columns``.
    """
    prepared = raw_features.copy()  # leave the caller's frame untouched

    # Same one-hot encoding as the training frame
    dummies = pd.get_dummies(prepared['ocean_proximity']).astype(int)
    prepared = prepared.join(dummies).drop(['ocean_proximity'], axis=1)

    # Same log(x + 1) transformation of the skewed count columns
    for col in ('total_rooms', 'total_bedrooms', 'population', 'households'):
        prepared[col] = np.log(prepared[col] + 1)

    # Same engineered ratio features
    prepared['bedroom_ratio'] = prepared['total_bedrooms'] / prepared['total_rooms']
    prepared['household_rooms'] = prepared['total_rooms'] / prepared['households']

    # Align with the training columns: a category absent from the test split
    # becomes an all-zero indicator column, and the column order matches the fit.
    return prepared.reindex(columns=expected_columns, fill_value=0)


# Separate features from target variable again for training
X_train, Y_train = train_data.drop(['median_house_value'], axis=1), train_data['median_house_value']

# Create a linear regression model
reg = LinearRegression()

# Train the model on the prepared training data
reg.fit(X_train, Y_train)

# Transform the held-out features the same way as the training features;
# the raw X_test still has the text column and lacks the engineered columns.
X_test_prepared = _prepare_test_features(X_test, X_train.columns)

# Make predictions on the unseen test data using the trained model
predictions = reg.predict(X_test_prepared)

# Calculate mean squared error (MSE)
mse = mean_squared_error(Y_test, predictions)
print("Mean Squared Error (MSE) on test data:", mse)

# Calculate R-squared
r2 = r2_score(Y_test, predictions)
print("R-squared on test data:", r2)

EmptyDataError: No columns to parse from file

In [None]:
import pandas as pd

# Load the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer="Data.csv")