In [None]:
# The first subset is known as the training data - it’s a portion of our actual dataset that is fed into the machine learning model to discover and learn patterns. In this way, it trains our model. 

# Once your machine learning model is built (with your training data), you need unseen data to test it. This data is called testing data; you use it to evaluate how well the trained model performs and to adjust or optimize the model for improved results.
# Testing data has two main criteria. It should:
# -Represent the actual dataset 
# -Be large enough to generate meaningful predictions


import pandas as pd
# pandas: open-source package for data analysis and manipulation, built on top
# of NumPy (which supplies the multi-dimensional array support underneath).

# Load the dataset from the CSV file located next to this notebook.
housing = pd.read_csv(filepath_or_buffer="./data.csv")

# Optional quick inspections (uncomment as needed):
# housing.head()                  # first 5 rows of the data in table format
# housing.info()                  # information about the data, e.g. total number of entries
# housing['CHAS'].value_counts()  # count of every distinct value of the given feature

# Summary statistics of each feature (count, mean, std, min, max, etc.).
housing.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

housing.hist(bins=50,figsize=(20,20))

## Train Test Splitting

In [None]:
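# A hand-rolled train/test split, kept commented out purely to show how such a function works internally.
# Note that it returns (test_set, train_set), in that order.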
# import numpy as np

# def split_train_test(data,test_ratio):
#     np.random.seed(42)  # fixed seed so the shuffle (and therefore the split) is the same on every run

#     shuffled=np.random.permutation(len(data))
#     test_set_size=int(len(data)* test_ratio)
#     test_indices=shuffled[:test_set_size]
#     train_indices=shuffled[test_set_size:]
#     return data.iloc[test_indices], data.iloc[train_indices]

In [None]:
# test_set,train_set=split_train_test(housing,0.2)

# print(f"Rows in training set: {len(train_set)}\n Rows in test set: {len(test_set)}")

In [None]:
from sklearn.model_selection import train_test_split

# Random split with 20% of the rows held out for testing; the fixed
# random_state keeps the split identical from run to run.
train_set, test_set = train_test_split(housing, random_state=42, test_size=0.2)

# Report how many rows ended up in each subset.
print(f"Rows in training set: {len(train_set)}\n Rows in test set: {len(test_set)}")

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratified split with 20% of the rows held out for testing. The 'CHAS' column
# is passed as the stratification labels so that the proportions of its values
# are preserved in both subsets.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair of index
# arrays; take it directly instead of looping over a single iteration.
train_index, test_index = next(split.split(housing, housing['CHAS']))

# split() yields *positional* row indices, so rows must be selected with .iloc
# (position-based). The label-based .loc only happens to give the same rows
# while the frame still has its default 0..n-1 index.
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

## Looking for correlations

In [None]:
# Pairwise correlation between the columns of the dataset.
corr_matrix = housing.corr()

# Correlation of MEDV with every other feature, largest coefficient first.
# A Pearson correlation coefficient of 1 means a strong positive correlation:
# when that feature's value increases, the price of the property increases too.
corr_matrix.loc[:, 'MEDV'].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

# Columns to plot against each other in the scatter-plot grid.
attributes = ['MEDV', 'RM', 'ZN', 'LSTAT']

# Grid of pairwise scatter plots for the selected columns on a 10x10 figure.
scatter_matrix(housing.loc[:, attributes], figsize=(10, 10))

In [None]:
# Scatter plot of RM (x axis) against MEDV (y axis); alpha=0.9 keeps the
# points almost fully opaque.
housing.plot.scatter(x='RM', y='MEDV', alpha=0.9)

## Trying out attribute combinations