In [58]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [59]:
# Read the housing CSV from its relative input path and preview the first rows.
housing_csv_path = '../input/usa-housing/USA_Housing.csv'
HouseDF = pd.read_csv(housing_csv_path)
HouseDF.head()

In [60]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
# The original run recorded 5000 rows and 7 columns.
HouseDF.shape

# Rounding up the necessary values to get cleaner data

In [61]:
# Round each of these columns up to the next whole number (ceiling),
# overwriting the column in place on HouseDF.
for column_name in (
    'Avg. Area Number of Rooms',
    'Avg. Area Number of Bedrooms',
    'Area Population',
    'Avg. Area Income',
):
    HouseDF[column_name] = np.ceil(HouseDF[column_name])

# Preview the frame after rounding.
HouseDF.head()

# A pairplot helps visualize the pairwise relationships between the features

In [62]:
# Pairwise plot grid over HouseDF, with kernel-density curves on the diagonal.
sns.pairplot(HouseDF, diag_kind='kde')

# Displaying a pairplot for a subset of the variables


In [63]:
# Restrict the pairwise grid to two columns: bedroom count and price.
pair_columns = ["Avg. Area Number of Bedrooms", "Price"]
sns.pairplot(
    HouseDF,
    vars=pair_columns,
    kind='scatter',
    diag_kind='kde',
)

In [64]:
# Distribution of house prices: a density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated in seaborn; `histplot(kde=True, stat='density')`
# is the documented replacement for the same kind of figure.
# Author's observation from the original run: most houses fall in the
# 1 to 1.5 million US dollar price range.
sns.histplot(HouseDF['Price'], kde=True, stat='density')

# Model Building

In [65]:
# Feature matrix: the five area-level columns used as model inputs.
feature_columns = [
    'Avg. Area Income',
    'Avg. Area House Age',
    'Avg. Area Number of Rooms',
    'Avg. Area Number of Bedrooms',
    'Area Population',
]
X = HouseDF[feature_columns]

# Target vector: the house price the model is trained to predict.
y = HouseDF['Price']

# Splitting the dataset into test and train  

In [66]:
from sklearn.model_selection import train_test_split

# 90/10 train/test split; the fixed seed makes the partition repeatable across runs.
TEST_FRACTION = 0.1
SPLIT_SEED = 101

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

# Creating and training the model

In [67]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split.
# scikit-learn's fit() returns the estimator itself, so `model` and
# `linear_regr_model` refer to the same fitted object.
linear_regr_model = LinearRegression()
linear_regr_model.fit(X_train, y_train)
model = linear_regr_model

# Coefficients learned by the model

In [68]:
# One row per feature, holding the fitted regression coefficient for that feature.
coeff_df = pd.DataFrame({'Coefficient': model.coef_}, index=X.columns)
coeff_df


# Conclusions from the coefficients

Keeping all the other features fixed:

An increase of 1 unit in Avg. Area Income is associated with an increase of $21.52 in the price of the real estate.

An increase of 1 unit in Avg. Area House Age is associated with an increase of $164883.28 in the price of the real estate.

An increase of 1 unit in Avg. Area Number of Rooms is associated with an increase of $122368.67 in the price of the real estate.

An increase of 1 unit in Avg. Area Number of Bedrooms is associated with an increase of $2233.80 in the price of the real estate.

An increase of 1 unit in Area Population is associated with an increase of $15.15 in the price of the real estate.

# Testing the predicted values

In [69]:
# Predict prices for the held-out test rows and compare them with the true prices.
predictions = model.predict(X_test)

# The original cell plotted the same (actual, predicted) pairs twice with the axes
# swapped, which does not separate "actual" from "predicted". Instead, draw one
# scatter of predicted vs. actual price plus a y = x reference line: the closer
# the points lie to that line, the better the prediction.
fig, ax = plt.subplots()
ax.plot(y_test, predictions, 'bs', label='Test samples')

# Endpoints of the reference line, spanning the full range of both series.
line_bounds = [
    min(y_test.min(), predictions.min()),
    max(y_test.max(), predictions.max()),
]
ax.plot(line_bounds, line_bounds, 'g-', label='Perfect prediction (y = x)')

ax.set_xlabel('Actual price')
ax.set_ylabel('Predicted price')
ax.set_title('Predicted vs. actual house prices (test set)')
ax.legend()

plt.show()