In [2]:
# TensorFlow / Keras
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'tensorflow'", so it stopped at the
# first import below; TensorFlow has to be installed in the kernel environment
# before anything else in the notebook can run.
import tensorflow as tf
# Binds the name `keras` to the `tensorflow.keras` submodule.
from tensorflow import keras
# NOTE(review): the next two lines import from the top-level `keras` package,
# not from `tensorflow.keras`; they do not rebind the name `keras` itself.
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN

# Print library versions so results can be tied to a concrete environment.
print('TensorFlow/Keras: %s' % tf.__version__)
print('Keras: %s' % keras.__version__)

# Data manipulation
import pandas as pd
import numpy as np
import math

print('pandas:%s' % pd.__version__)
print('numpy: %s' % np.__version__)

# Scikit-learn: splitting, metrics, scaling and missing-value imputation
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

print('scikit-learn: %s' % sklearn.__version__)

# Visualization

import seaborn as sns

print('Seaborn: %s' % sns.__version__)


ModuleNotFoundError: No module named 'tensorflow'

In [None]:
import pandas as pd

# Load the housing dataset; the relative path is resolved against the
# notebook's working directory.
csv_path = 'Housing.csv'
df = pd.read_csv(csv_path)

In [None]:
# Show the freshly loaded DataFrame as the cell output.
df

In [None]:
# Print pandas' summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Count the missing (NaN) entries in every column and print the per-column totals.
missing_counts = df.isna().sum()
print(missing_counts)

In [None]:
# Replace missing values with the mean of their column.
# Alternative strategies to try: 'median', 'most_frequent', 'constant'.
imputer = SimpleImputer(strategy='mean')
column_names = df.columns
imputed_values = imputer.fit_transform(df)
# fit_transform returns a plain array, so rebuild the DataFrame with the
# original column labels.
df = pd.DataFrame(imputed_values, columns=column_names)

In [None]:
# Human-readable description for each dataset column, keyed by column name.
# Used as plot titles further down.
explicit_titles = dict(
    CRIM='Per capita crime rate by town',
    ZN='Proportion of residential land zoned for lots over 25,000 sq.ft',
    INDUS='Proportion of non-retail business acres per town',
    CHAS='Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)',
    NOX='Nitric oxides concentration (parts per 10 million)',
    RM='Average number of rooms per dwelling',
    AGE='Proportion of owner-occupied units built prior to 1940',
    DIS='Weighted distances to five Boston employment centres',
    RAD='Index of accessibility to radial highways',
    TAX='Full-value property-tax rate per $10,000',
    PTRATIO='Pupil-teacher ratio by town',
    B='1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town',
    LSTAT='% Lower status of the population',
    MEDV='Median value of owner-occupied homes in $1000s',
)

In [None]:
import matplotlib.pyplot as plt

# Draw one histogram per column, each in its own figure.
# One palette entry per column so every histogram gets a distinct colour.
palette = sns.color_palette("magma", len(df.columns))

for i, col in enumerate(df.columns):
    fig, ax = plt.subplots()
    sns.histplot(data=df, x=col, color=palette[i], ax=ax)
    # Use the descriptive title when one is defined; fall back to the raw
    # column name so an unlisted column does not abort the loop with KeyError.
    ax.set_title(explicit_titles.get(col, col))
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)


In [None]:
# Standardise every column (features and target) with StandardScaler and wrap
# the resulting array back into a DataFrame with the original column labels.
# NOTE(review): the scaler is fitted on the complete dataset before any
# train/test split, so test rows influence the scaling statistics, and the
# target is scaled too, so downstream errors are in standardised units.
# Confirm this is intended.
scaler = StandardScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

# Target is MEDV; the features are every other column. Both are selected by
# name so the target can never slip into the feature matrix if the column
# order of the CSV changes (a positional `iloc[:, :-1]` assumes MEDV is last).
x = df_normalized.drop(columns='MEDV')
y = df_normalized['MEDV']



In [None]:

# Scatter every standardised feature against the standardised target (MEDV).
# `x` and `y` are the feature matrix and target defined in the scaling cell;
# they are reused here instead of being derived a second time.

# Grid layout: five panels per row, with as many rows as the feature count
# needs (a fixed 3x5 grid would fail once there are more than 15 features).
n_cols = 5
n_features = len(x.columns)
n_rows = max(1, -(-n_features // n_cols))  # ceiling division, at least one row

fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 10), squeeze=False)
flat_axes = axes.ravel()

for ax, col in zip(flat_axes, x.columns):
    ax.scatter(x[col], y, s=10)
    ax.set_xlabel(col)
    ax.set_ylabel('MEDV')

# Hide the grid cells that have no feature to show.
for ax in flat_axes[n_features:]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()


In [None]:
from sklearn.linear_model import LinearRegression

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)

# Baseline linear model and its predictions for the held-out rows.
model = LinearRegression().fit(x_train, y_train)
y_pred = model.predict(x_test)

# A second, independently fitted estimator is the one used for scoring;
# it stays untouched if `model` is refitted later.
reg = LinearRegression()
reg.fit(x_train, y_train)
score = reg.score(x_test, y_test)
print("R^2 score: ", score)



In [None]:
import matplotlib.pyplot as plt

# Learning curve: refit the linear model on the first i training rows and
# record the test-set MSE for every training-set size i.
# The range is inclusive of len(x_train), so the last point uses the whole
# training set and `model` / `y_pred` end up reflecting the fully trained model
# (stopping one short would leave `model` fitted on all rows but one).
train_sizes = range(1, len(x_train) + 1)
mse_values = []
for i in train_sizes:
    # .iloc makes the "first i rows" slice explicitly positional.
    model.fit(x_train.iloc[:i], y_train.iloc[:i])
    y_pred = model.predict(x_test)
    mse_values.append(mean_squared_error(y_test, y_pred))

# The x-axis is the number of training rows used, not an optimiser iteration.
plt.plot(train_sizes, mse_values)
plt.xlabel('Number of Training Samples')
plt.ylabel('Mean Squared Error')
plt.title('Mean Squared Error vs Training Set Size')
plt.show()

In [None]:

from sklearn.metrics import r2_score

# Split data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

# Convert to NumPy arrays
x_train = x_train.to_numpy()
x_test = x_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

# Add a trailing axis of length 1: the LSTM then sees each row as a sequence
# with one step per feature column and a single value per step.
x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], 1))
x_test = x_test.reshape((x_test.shape[0], x_test.shape[1], 1))

# Seed TensorFlow's global RNG so weight initialisation is repeatable.
tf.random.set_seed(1)

# Model architecture: one LSTM layer followed by a small dense regression head
model = tf.keras.models.Sequential([
  tf.keras.layers.LSTM(128, activation='relu', input_shape=(x_train.shape[1], 1)),
  tf.keras.layers.Dense(64, activation='relu'),
  tf.keras.layers.Dense(32, activation='relu'),
  tf.keras.layers.Dense(1)
])

# Compile the model (regression: MSE loss, no classification metrics)
model.compile(optimizer='adam', loss='mse')

# Train the model
# NOTE(review): the test split doubles as validation data here, so the
# validation curve is not an independent estimate. Consider a separate split.
history = model.fit(x_train, y_train, epochs=100, batch_size=16, validation_data=(x_test, y_test), verbose=0)

# Plot the training and validation loss.
# Only 'loss' and 'val_loss' are recorded because no metrics were compiled;
# reading history.history['accuracy'] would raise KeyError, and accuracy is
# not a meaningful measure for a regression target anyway.
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')

plt.xlabel('Epoch')
plt.ylabel('Loss (MSE)')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()

# Report R^2 on the held-out rows so the network can be compared with the
# other models. predict() returns shape (n, 1); flatten it to match y_test.
lstm_pred = model.predict(x_test, verbose=0).ravel()
print("LSTM R^2 score: ", r2_score(y_test, lstm_pred))








In [None]:
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fresh 80/20 split of the features and target (same seed as the earlier cells).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)

# Build and train a 100-tree random forest; fit() returns the estimator itself.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(x_train, y_train)

# Rank the features from most to least important.
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
names = list(x_train.columns[indices])

# Bar chart of the ranked importances, labelled with the matching feature names.
bar_positions = range(x_train.shape[1])
plt.figure()
plt.bar(bar_positions, importances[indices])
plt.xticks(bar_positions, names, rotation=90)
plt.title("Feature Importance")
plt.show()

# Sweep n_estimators from 10 to 190 in steps of 10, refitting the same
# estimator each time and recording its score on the test split.
# This will take a minute or so.
estimators = np.arange(10, 200, 10)
scores = []
for n in estimators:
    model.set_params(n_estimators=n)
    scores.append(model.fit(x_train, y_train).score(x_test, y_test))

# Score as a function of forest size.
plt.ylabel("score")
plt.xlabel("n_estimator")
plt.title("Effect of n_estimators")
plt.plot(estimators, scores)


In [None]:
# Show the test-set scores collected for each n_estimators value in the sweep.
scores

In [None]:
# Average of the collected scores across all n_estimators settings tried.
np.mean(scores)