# Kunskapskontroll

### Fyll i uppgifterna nedan innan du lämnar in på LearnPoint: 
Namn på samtliga gruppmedlemmar: 

Asami, Guy, Mustafa,  Pawel, Vidar

# Code

##### Importer

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import linear_model
from sklearn.linear_model import Lasso
from sklearn.svm import SVR


##### Läsa in CSV

In [None]:
# Load the housing dataset; the path is relative to the notebook's working directory.
housing = pd.read_csv('data/housing.csv')


#### Överblicka datan

In [None]:
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called on its own; wrapping it in print() would add a stray "None" line.
housing.info()

In [None]:
# Draw a grid of 50-bin histograms for the columns of `housing` on one large figure.
housing.hist(figsize=(20, 15), bins=50)
plt.suptitle('Distribution of Housing Data Features')  # figure-level title above the grid
plt.show()

In [None]:
# Geographic scatter plot: marker size follows population (scaled down by 100),
# marker colour follows median_house_value on the "jet" colormap.
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    s=housing["population"] / 100,
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    alpha=0.4,
    label="population",
    figsize=(10, 7),
)
plt.legend()
plt.title('California Housing Prices and Population Distribution')
plt.show()


In [None]:
# Count how many rows fall into each ocean_proximity category.
housing['ocean_proximity'].value_counts()

In [None]:
# Map every row by longitude/latitude: colour encodes the ocean_proximity
# category and marker size scales with population.
plt.figure(figsize=(10, 7))
sns.scatterplot(
    data=housing,
    x="longitude",
    y="latitude",
    hue="ocean_proximity",
    size="population",
    sizes=(20, 200),
    palette="muted",
    alpha=0.4,
)
plt.legend()
plt.title('Geographical Distribution of Housing by Ocean Proximity and Population Size')
plt.show()


#### Förbered data
- Ta bort > 500,000 från kolumnen median_house_value
- Ta bort > 51 från kolumnen housing_median_age
- Konvertera ocean_proximity med ordinal encoding (0–3), ta bort ISLAND
- Fyll tomma värden med medelvärde/median i kolumnen total_bedrooms

In [None]:
# Keep only rows with median_house_value <= 500000, housing_median_age <= 51
# and an ocean_proximity other than 'ISLAND'. The result is an independent copy,
# so later edits to it leave `housing` untouched.
filtered_housing = housing.loc[
    (housing['median_house_value'] <= 500000)
    & (housing['housing_median_age'] <= 51)
    & (housing['ocean_proximity'] != 'ISLAND')
].copy()


In [None]:
# Desired ordering of the ocean_proximity categories. Only the key order is
# handed to the encoder; the integer values in the dict document the intended
# codes but are not passed on.
category_mapping = {'INLAND': 0, '<1H OCEAN': 1, 'NEAR OCEAN': 2, 'NEAR BAY': 3}
categories = list(category_mapping)
encoder = OrdinalEncoder(categories=[categories])

# Store the encoded category as a new column next to the original string column.
filtered_housing['ocean_proximity_encoded'] = encoder.fit_transform(
    filtered_housing[['ocean_proximity']]
)


In [None]:
# Impute missing 'total_bedrooms' entries with the column mean, computed on the
# already-filtered rows.
mean_total_bedrooms = filtered_housing['total_bedrooms'].mean()
filtered_housing.loc[:, 'total_bedrooms'] = (
    filtered_housing['total_bedrooms'].fillna(mean_total_bedrooms)
)


##### Jämföra ny data med hjälp av tabell och diagram
 


In [None]:
def _plot_before_after(original, filtered, column, label):
    """Draw side-by-side histograms of one column before and after filtering.

    Args:
        original: DataFrame holding the unfiltered data.
        filtered: DataFrame holding the filtered data.
        column: Name of the column to plot from both DataFrames.
        label: Human-readable column name inserted into the subplot titles.
    """
    plt.figure(figsize=(12, 6))
    frames = (('Original', original), ('Filtered', filtered))
    for position, (prefix, frame) in enumerate(frames, start=1):
        plt.subplot(1, 2, position)
        sns.histplot(frame[column], bins=50, kde=True)
        plt.title(f'{prefix} {label} Distribution')
    plt.show()


# Snapshot of the unfiltered data for comparison. The filtering step produced
# `filtered_housing` as a separate copy, so `housing` still holds the raw rows.
original_housing = housing.copy()

# Summary statistics, before and after filtering.
print("Original Data:")
display(original_housing.describe())
print("\nFiltered Data:")
display(filtered_housing.describe())

# info() prints its own report and returns None, so it is not wrapped in print().
filtered_housing.info()

# Distribution changes in the two columns that were range-filtered.
_plot_before_after(original_housing, filtered_housing,
                   'median_house_value', 'Median House Value')
_plot_before_after(original_housing, filtered_housing,
                   'housing_median_age', 'Housing Median Age')

# Category counts: original string labels versus the encoded column.
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
original_housing['ocean_proximity'].value_counts().plot(kind='bar')
plt.title('Original Ocean Proximity Counts')
plt.subplot(1, 2, 2)
filtered_housing['ocean_proximity_encoded'].value_counts().plot(kind='bar')
plt.title('Encoded Ocean Proximity Counts')
plt.show()





### Hitta vilka variabler/features att ha med testdata 

In [None]:
# Drop the string-valued 'ocean_proximity' column (drop() returns a new frame),
# then visualise the pairwise correlations of the remaining columns.
housing_float = filtered_housing.drop(columns=['ocean_proximity'])
corr_matrix = housing_float.corr()
plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True)

#### Prova nya variabler

In [None]:
# Build a new frame with derived ratio features; assign() returns a copy, so
# `housing_float` itself is left unchanged.
housing_variables = housing_float.assign(
    rooms_per_household=housing_float["total_rooms"] / housing_float["households"],
    bedrooms_per_room=housing_float["total_bedrooms"] / housing_float["total_rooms"],
    population_per_household=housing_float["population"] / housing_float["households"],
    # The features below were added as an experiment (GB); population_per_bedroom
    # in particular looks promising.
    # bedrooms_per_household=housing_float["total_bedrooms"] / housing_float["households"],
    population_per_bedroom=housing_float["population"] / housing_float["total_bedrooms"],
)


In [None]:

# Correlation matrix of `housing_variables`, rendered as an annotated heatmap.
corr_matrix = housing_variables.corr()
plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Rank every column by its correlation with median_house_value, largest value first.
corr_matrix = housing_variables.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)


In [None]:
# Print a summary (columns, non-null counts, dtypes) of the frame with derived features.
housing_variables.info()

#### Använda RandomForestRegressor för att hitta de viktigaste variablerna/features


In [None]:
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

# Predictors are every column except the target, median_house_value.
X = housing_variables.drop('median_house_value', axis=1)
y = housing_variables['median_house_value']

# Two-stage split with a fixed seed: first hold out 20% as the test set, then
# take 20% of the remaining rows as a validation set.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# Fit a random forest on the training split only. The seed is fixed (matching
# the seed used for the splits) so the importance ranking below is reproducible
# from run to run instead of changing with every execution of the cell.
rf_reg = RandomForestRegressor(random_state=42)
rf_reg.fit(X_train, y_train)

# Importance score of each feature, aligned with the training column names.
importance = rf_reg.feature_importances_
feature_names = X_train.columns

# Sort ascending so the most important feature ends up at the top of the
# horizontal bar chart.
sorted_idx = np.argsort(importance)
sorted_feature_names = feature_names[sorted_idx]
sorted_importance = importance[sorted_idx]

plt.figure(figsize=(12, 8))
plt.barh(sorted_feature_names, sorted_importance, color='skyblue')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance')
plt.tight_layout()  # keep long feature names from being clipped
plt.show()

# Börja träna modeller

In [22]:
# Reduced modelling frame: the target plus three predictors, taken as an
# independent copy of the selected columns.
final_data = housing_variables.loc[
    :,
    ['median_house_value', 'median_income', 'ocean_proximity_encoded', 'population_per_household'],
].copy()
final_data.info()


<class 'pandas.core.frame.DataFrame'>
Index: 18570 entries, 0 to 20639
Data columns (total 4 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   median_house_value        18570 non-null  float64
 1   median_income             18570 non-null  float64
 2   ocean_proximity_encoded   18570 non-null  float64
 3   population_per_household  18570 non-null  float64
dtypes: float64(4)
memory usage: 725.4 KB


In [23]:
# Same selection as `final_data`, extended with the longitude/latitude columns.
final_ll_data = housing_variables.loc[
    :,
    [
        'median_house_value',
        'median_income',
        'ocean_proximity_encoded',
        'population_per_household',
        'longitude',
        'latitude',
    ],
].copy()
final_ll_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18570 entries, 0 to 20639
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   median_house_value        18570 non-null  float64
 1   median_income             18570 non-null  float64
 2   ocean_proximity_encoded   18570 non-null  float64
 3   population_per_household  18570 non-null  float64
 4   longitude                 18570 non-null  float64
 5   latitude                  18570 non-null  float64
dtypes: float64(6)
memory usage: 1015.5 KB


### Linear Regression

In [32]:
# Separate the target from the predictors.
y = final_ll_data['median_house_value']
X = final_ll_data.drop(columns='median_house_value')

# Stage 1: hold out 20% of the rows as the test set (fixed seed).
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Stage 2: take 20% of the remaining rows as a validation set (same seed).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42
)

In [33]:
# Fit a linear regression model on the training split only; the validation and
# test splits are not used for fitting.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

In [34]:
# Root-mean-squared error of the linear model on the validation split.
# The square root is taken explicitly instead of passing `squared=False`:
# that keyword is deprecated in scikit-learn and removed in recent releases,
# whereas sqrt(MSE) gives the same value on every version.
lin_reg_pred_val = lin_reg.predict(X_val)
np.sqrt(mean_squared_error(y_val, lin_reg_pred_val))

61273.37754680553

### Support Vector Regressor