https://github.com/ageron/handson-ml2/blob/master/02_end_to_end_machine_learning_project.ipynb
https://colab.research.google.com/github/ageron/handson-ml2/blob/master/02_end_to_end_machine_learning_project.ipynb#scrollTo=E-hlmIU5tN3P


# Tutorial 1 (housing) Data Exploration and Visualization (Getting to know your data)

## Setup

In [29]:
import sys
assert sys.version_info >= (3, 5)
# Python ≥3.5 is required


In [30]:
# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

In [31]:
# Common imports
import numpy as np
import os
import tarfile
import urllib
import pandas as pd
import urllib.request
import seaborn as sns


In [32]:
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Get the data

With this code, we load the US Accidents dataset from the CSV file `US_Accidents_Dec21_updated.csv`, which is expected to be in the same folder as the notebook.

In [None]:
# Load the accidents CSV into a DataFrame; the relative path is resolved against the current working directory.
data = pd.read_csv("US_Accidents_Dec21_updated.csv")

In [None]:
# Preview the first rows of the DataFrame (head() shows 5 rows by default).
data.head()

In [None]:
# info() prints a summary of the DataFrame: column dtypes, non-null counts and memory usage.
data.info() 
# Q2 Can you tell if there are any missing values? If yes, which attributes contain missing values?

Not all attributes are numerical: fields such as `City` and `Weather_Condition` hold text values.

In [None]:
# describe() gives a statistical summary of the numerical attributes; .T transposes it so that each attribute becomes a row.
data.describe().T

In [None]:
# Count the missing (null) values in each column.
data.isnull().sum()

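The count, mean, min, and max rows are self-explanatory. Note that the null values are ignored, so the count of an attribute with missing values is smaller than the total number of rows (for example, in the California housing dataset the count of total_bedrooms is 20,433, not 20,640).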

The 25%, 50%, and 75% rows show the corresponding percentiles: a percentile indicates the value below which a given percentage of the observations in a group falls.

In [None]:
# Draw one histogram (50 bins) per numerical attribute on a 20x15 inch figure.
# Q4 List down the main observations you noted from the statistical summary and the histogram
data.hist(figsize=(20, 15), bins=50)
plt.show()

In [None]:
# Distinct values of the City column, followed by how many there are.
cities = data["City"].unique()
len(cities)

In [None]:
# Number of records per city, most frequent first; show the top 20.
numOfAccidentsPerCity = data["City"].value_counts()
numOfAccidentsPerCity.head(20)

In [None]:
# Records per weather condition (most frequent first) and each condition's
# share, in percent, of all counted records.
Weatherconditions = (
    data["Weather_Condition"]
    .value_counts()
    .rename_axis("Weather_Condition")
    .reset_index(name="Accidents")
)
Weatherconditions["Percentage"] = (
    Weatherconditions["Accidents"] * 100 / Weatherconditions["Accidents"].sum()
).round(2)
Weatherconditions.head(10)

In [None]:
# Records per severity level (most frequent first) and each level's share,
# in percent, of all counted records.
Severitycounts = (
    data["Severity"]
    .value_counts()
    .rename_axis("Severity")
    .reset_index(name="Accidents")
)
Severitycounts["Percentage"] = (
    Severitycounts["Accidents"] * 100 / Severitycounts["Accidents"].sum()
).round(2)
Severitycounts.head()

In [None]:
# NOTE: this binds a second name to the same DataFrame object; it does not make a copy.
table = data

# Correlations and Visualizations


In [None]:
# Now we will look at the correlation of all the attributes with the expected class attribute (Severity).
# Text columns cannot be correlated, and recent pandas versions raise an error
# instead of silently skipping them, so keep only numeric and boolean columns.
# corr() computes the standard correlation coefficient (Pearson's r) between every pair of attributes.
corr_matrix = data.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Horizontal bar chart of the 20 cities with the most records.
numOfAccidentsPerCity.head(20).plot.barh()

In [None]:
# Horizontal bar chart of the 20 most frequent weather conditions.
# `Weatherconditions` is the per-condition count table built earlier in this
# notebook (columns: Weather_Condition, Accidents, Percentage). The columns are
# named explicitly so the bars show the accident count per condition.
Weatherconditions[:20].plot(kind='barh', x='Weather_Condition', y='Accidents')

In [None]:
# Correlation of each attribute with Severity, sorted from the highest coefficient to the lowest.
corr_matrix["Severity"].sort_values(ascending=False)
# Most correlated attributes to Severity

In [None]:
from pandas.plotting import scatter_matrix

# Columns to include in the scatter matrix; only Severity is listed here.
attributes = ["Severity"]
scatter_matrix(data[attributes], figsize=(12, 8))
# Another way to check for correlation between attributes is to use the pandas scatter_matrix() function above
# NOTE(review): only one attribute is plotted; add further numerical columns to `attributes` to plot them against each other.


In [None]:
# Box plot of the Humidity(%) column for each Severity value.
plt.figure(figsize=(12, 8))
ax = sns.boxplot(data=data, x="Severity", y="Humidity(%)")
ax.set_xlabel('Severity', fontsize=12)
ax.set_ylabel('Humidity(%)', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()


In [None]:
# Box plot of the Temperature(F) column for each Severity value.
plt.figure(figsize=(12,8))
sns.boxplot(x="Severity", y="Temperature(F)", data=data)
# Axis label spelling corrected ("Fahrenheit").
plt.ylabel('Temperature(Fahrenheit)', fontsize=10)
plt.xlabel('Severity', fontsize=10)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Box plot of the Wind_Chill(F) column for each Severity value.
plt.figure(figsize=(12, 8))
ax = sns.boxplot(data=data, x="Severity", y="Wind_Chill(F)")
ax.set_xlabel('Severity', fontsize=10)
ax.set_ylabel('Wind_Chill(F)', fontsize=10)
plt.xticks(rotation='vertical')
plt.show()

# Experimenting with Attribute Combinations

In [None]:
# Here you will generate new features. This is what we call feature engineering
# Q8 What are the new features that you are generating? Do they make sense ?

# Difference between the end and start latitude of each record.
data["latitude_diff"] = data["End_Lat"] - data["Start_Lat"]

# Elapsed time between Start_Time and End_Time, expressed in hours.
data["Total_Time(hrs)"] = (
    pd.to_datetime(data["End_Time"]) - pd.to_datetime(data["Start_Time"])
).dt.total_seconds() / 3600
data.head()

In [None]:
#Q10 Plot the scatter plot of the rooms_per_household against median_house_value
#Q11 Now use the housing describe method to view the statistical summary of the dataset 

# Prepare the Data for Machine Learning Algorithms

## Data Cleaning
Dealing with missing features


1. Get rid of the corresponding rows (records). `dropna()`
2. Get rid of the whole attribute. `drop()` 
3. Set the values to some value (zero, the mean, the median, etc.)  `fillna()`


In [None]:
# Keep the first five rows that have a missing value in at least one column, then display them.
sample_incomplete_rows = data.loc[data.isna().any(axis="columns")].head()
sample_incomplete_rows


In [None]:
# Option 1: remove the records (i.e. rows) that have a missing value in the chosen attribute.
# "Wind_Chill(F)" is a column of the accidents data; the "total_bedrooms" column used
# previously does not exist in this dataset.
# NOTE(review): assumes "Wind_Chill(F)" contains missing values — confirm with data.isnull().sum().
sample_incomplete_rows.dropna(subset=["Wind_Chill(F)"])
#Q12 What is displayed below? Note your observation

In [None]:
# Option 2: remove the entire attribute that has the missing values.
# "Wind_Chill(F)" is a column of the accidents data; the "total_bedrooms" column used
# previously does not exist in this dataset.
sample_incomplete_rows.drop("Wind_Chill(F)", axis=1)
#Q13  What do you see now? How is this different from your observation in Q12


In [None]:
# Option 3: fill the missing values with the column median.
# NOTE(review): assumes "Wind_Chill(F)" is a numeric column of `data` with missing
# values — confirm with data.isnull().sum().
median = data["Wind_Chill(F)"].median()
# Assign the result back instead of calling fillna(inplace=True) on a selected
# column, which may act on a temporary copy and leave the frame unchanged.
sample_incomplete_rows = sample_incomplete_rows.fillna({"Wind_Chill(F)": median})
# Q14 Note your observation in the displayed rows below. How is it different from Q12 and Q13


In [None]:
# Display the sample rows again to see the effect of the previous cell.
sample_incomplete_rows

## Scikit-Learn solution (optional)


In [None]:
from sklearn.impute import SimpleImputer      # Look up the SimpleImputer class in the Scikit-Learn documentation
imputer = SimpleImputer(strategy="median")    # Replace each missing value with the median of its attribute

In [None]:
# The median cannot be computed for text attributes, so keep only the numerical
# columns of the accidents data. The name `housing_num` is kept because the
# following cells refer to it.
housing_num = data.select_dtypes(include=[np.number])

In [None]:
# Fit the imputer on the numerical data (strategy chosen when the imputer was created).
imputer.fit(housing_num)

The imputer has simply computed the median of each attribute and stored the result in its `statistics_` instance variable. 

In [None]:
# Per-attribute statistics stored by the fitted imputer.
imputer.statistics_

In [None]:
# Check that this matches the medians computed manually for each attribute.
housing_num.median().values


In [None]:
# Transform the data set: missing values are replaced using the fitted imputer.
X = imputer.transform(housing_num)

In [None]:
# Load the transformed data set into the pandas DataFrame housing_tr,
# reusing the column names and row index of housing_num.
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index=housing_num.index)

In [None]:
# Show the transformed values for the sample rows that previously had missing values.
housing_tr.loc[sample_incomplete_rows.index.values]

### Handling Text and Categorical Attributes

In [None]:
# Weather_Condition is a text (categorical) attribute of the accidents data; the
# double brackets keep it as a one-column DataFrame. The name `housing_cat` is kept
# because the following cells refer to it.
# NOTE(review): rows with a missing category are dropped, presumably needed so the
# encoders below see only strings — confirm.
housing_cat = data[["Weather_Condition"]].dropna()
housing_cat.head(10)  # look at the category value of the first 10 instances

In [None]:
housing_cat.tail(10)
# Look at the category value of the last 10 instances

In [None]:
from sklearn.preprocessing import OrdinalEncoder
# The sklearn OrdinalEncoder class is used to convert categorical values to numbers
ordinal_encoder = OrdinalEncoder() 
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10] # Lists the numerical values that correspond to the categorical attribute
# Q15 Why can representing a categorical variable with numbers be a problem in ML?


In [None]:
# categories_ holds one 1-D array of categories per encoded attribute;
# here only one categorical attribute was encoded.
ordinal_encoder.categories_


In [None]:
# An alternative way to represent a categorical attribute is to use 'one-hot encoding'

from sklearn.preprocessing import OneHotEncoder  
# The OneHotEncoder class converts categorical values into one-hot vectors; this assumes no order, so it suits categorical variables better

cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

In [None]:
type(housing_cat_1hot) # This is a SciPy sparse matrix (not a NumPy array)
# A sparse matrix only stores the location of the non-zero entries, and therefore saves memory

In [None]:
housing_cat_1hot.toarray() # To convert it to a (dense) NumPy array, call the toarray() method

In [None]:
# Build the one-hot encoding directly as a dense NumPy array.
# The `sparse=False` constructor argument was renamed `sparse_output` in newer
# scikit-learn releases and the old spelling no longer works there; converting
# the default sparse result with toarray() works on every version.
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat).toarray()
housing_cat_1hot

In [None]:
# Get the list of categories learned by the one-hot encoder.
cat_encoder.categories_

### Feature Scaling (optional)




In week 2 lectures you have learned about two common ways to get all attributes to have the same scale: min-max scaling and standardization (Zscore).

Scikit-Learn provides two transformer classes for this: `MinMaxScaler` (min-max scaling) and `StandardScaler` (standardization).

In [None]:
#Q16 Research sklearn documentation for the functions MinMaxScaler() and StandardScaler() and try to experiment with it on the housing data set