# House pricing prediction

In [1]:
#biblotek 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV

# Begynner med å importere dataen.

In [3]:
trainingData=pd.read_csv('train.csv')
trainingData.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
testData = pd.read_csv('test.csv')
testData.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


# Begynner med å ryddde opp i dataen

In [6]:
trainingData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

Dropper kolonner der nullverdiene utgjør mer en 50% av alle objektene.
Så foreks kolonner som alley, PoolQc, Fence, og MiscFeature har veldig mangen null-verdier osm vil gå utover 
resultatet

In [7]:
trainingData.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature','MasVnrType'],axis=1,inplace=True)
trainingData.drop(['Id'],axis=1,inplace=True)

Så må vi erstatte nullverdier i de andre kolonnene. Måten vi erstatter dem kommer litt ann på hvilke type
kolonnen har. Med tall tar vi ofte bare å erstatter null-veridene med gjennomsnittet. Med kategoriske verdier er det kanskje lurere å erstatte dem med den mest hyppige typen.  

Erstatter med gjennomsnitt:

In [8]:
for column in trainingData.columns:
    if trainingData[column].dtype in ['int64', 'float64']:
        column_mean = trainingData[column].mean()
        trainingData[column].fillna(column_mean, inplace=True)

Erstatter med hyppigeste type

In [9]:
for column in trainingData.columns:
    if trainingData[column].dtype == 'object':
        trainingData[column].fillna(trainingData[column].mode()[0], inplace=True)

In [10]:
trainingData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 75 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   LotShape       1460 non-null   object 
 6   LandContour    1460 non-null   object 
 7   Utilities      1460 non-null   object 
 8   LotConfig      1460 non-null   object 
 9   LandSlope      1460 non-null   object 
 10  Neighborhood   1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Condition2     1460 non-null   object 
 13  BldgType       1460 non-null   object 
 14  HouseStyle     1460 non-null   object 
 15  OverallQual    1460 non-null   int64  
 16  OverallCond    1460 non-null   int64  
 17  YearBuilt      1460 non-null   int64  
 18  YearRemo

Da har vi blitt kvitt alle null-verdier trainingData. Nå må vi gjøre det samme for testData

In [11]:
testData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [12]:
testData.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature','MasVnrType'],axis=1,inplace=True)
testData.drop(['Id'],axis=1,inplace=True)

In [13]:
for column in testData.columns:
    if testData[column].dtype in ['int64', 'float64']:
        column_mean = testData[column].mean()
        testData[column].fillna(column_mean, inplace=True)

In [14]:
for column in testData.columns:
    if testData[column].dtype == 'object':
        testData[column].fillna(testData[column].mode()[0], inplace=True)

In [15]:
testData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 74 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1459 non-null   int64  
 1   MSZoning       1459 non-null   object 
 2   LotFrontage    1459 non-null   float64
 3   LotArea        1459 non-null   int64  
 4   Street         1459 non-null   object 
 5   LotShape       1459 non-null   object 
 6   LandContour    1459 non-null   object 
 7   Utilities      1459 non-null   object 
 8   LotConfig      1459 non-null   object 
 9   LandSlope      1459 non-null   object 
 10  Neighborhood   1459 non-null   object 
 11  Condition1     1459 non-null   object 
 12  Condition2     1459 non-null   object 
 13  BldgType       1459 non-null   object 
 14  HouseStyle     1459 non-null   object 
 15  OverallQual    1459 non-null   int64  
 16  OverallCond    1459 non-null   int64  
 17  YearBuilt      1459 non-null   int64  
 18  YearRemo

Vi må håndtere alle kategoriske verdier. Dette er fordi maskinlærings modeller bruker ofte numeriske verdier. 
Lager så en metode som gjør dette med alle kolonnene som har kategoriske verdier. 

In [16]:
def dummyGen(multcolumns):
    df_final=df
    i=0
    for fields in multcolumns:
        
        
        df1=pd.get_dummies(df[fields],drop_first=True)
        
        df.drop([fields],axis=1,inplace=True)
        if i==0:
            df_final=df1.copy()
        else:
            
            df_final=pd.concat([df_final,df1],axis=1)
        i=i+1
       
        
    df_final=pd.concat([df,df_final],axis=1)
        
    return df_final

In [17]:
kolonner = trainingData.select_dtypes(include=['object', 'category']).columns.tolist()


38

In [18]:
testData.shape

(1459, 74)

Vi vil unngå at test og data har ulike antall kategorier, det vil si at ikke alle kategorier av en type kommer opp. Det er viktig at test og training data har like verdier slik at modellen blir trent best mulig opp. Bruker da en concat å lage en ny tabell der trening og test data deler antall kolonner.

In [19]:
df=pd.concat([trainingData, testData],axis=0)

In [20]:
df=dummyGen(kolonner)

MSZoning
Street
LotShape
LandContour
Utilities
LotConfig
LandSlope
Neighborhood
Condition1
Condition2
BldgType
HouseStyle
RoofStyle
RoofMatl
Exterior1st
Exterior2nd
ExterQual
ExterCond
Foundation
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinType2
Heating
HeatingQC
CentralAir
Electrical
KitchenQual
Functional
FireplaceQu
GarageType
GarageFinish
GarageQual
GarageCond
PavedDrive
SaleType
SaleCondition


Fjerner duplikater

In [21]:
df = df.loc[:,~df.columns.duplicated()]
df.shape

(2919, 176)

In [22]:
df_Train=df.iloc[:1460,:]
df_Test=df.iloc[1460:,:]
df_Test.shape


(1459, 176)

In [23]:
df_Test.drop(['SalePrice'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_Test.drop(['SalePrice'],axis=1,inplace=True)


Oppsummring: Da har vi ryddet i både training og test settet. fjernet NaN-verdier,laget dummies av kategoriske verdier. Vi har brukt concat for å få samme antall kategorier i test og training. Og så splittet det opp igjen. 

Nå gjenstår det å velge algorithme og modell. Så kan gi den df_Train, trene den og så utføre modellen på df_Test data og få predikasjoner. 
Jeg velger å bruke XGboost.


Splitter opp df_train i X og y. 

In [24]:
X_train = df_Train.drop(['SalePrice'],axis=1)
y_train = df_Train['SalePrice']

In [25]:
!pip install xgboost



Som sagt har jeg tenkt til å bruke XGBboost algoritmen. For å optimalisere algoritmen vil gjøre jeg litt hyperparameter tuning.
Det vil si å finne dem parameterne som retunerer det beste resultatet. 

In [26]:
import xgboost
regressor=xgboost.XGBRegressor()

In [27]:
from scipy.stats import randint, uniform

# Definerer hvilke parametere vi vil endre og gjøre bedre.
hyperparameter_grid = {
    'n_estimators': randint(100, 1500),  
    'max_depth': randint(2, 15),  
    'learning_rate': uniform(0.05, 0.2),  
    'subsample': uniform(0.5, 0.5),  
    'min_child_weight': randint(1, 4),  
    'gamma': uniform(0, 1)  
}

In [28]:
random_cv = RandomizedSearchCV(estimator=regressor,
            param_distributions=hyperparameter_grid,
            cv=5, n_iter=50,
            scoring = 'neg_mean_absolute_error',n_jobs = 4,
            verbose = 5, 
            return_train_score = True,
            random_state=42)

In [29]:
random_cv.fit(X_train,y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


random_cv blir trent med X_train og y_train. Så retunerer den en model med de beste estimatorene. Den lagrer vi
variabelen best_model

In [30]:
best_model = random_cv.best_estimator_

Så trener vi den modelen med X_train og y_train 

In [31]:
best_model.fit(X_train,y_train)

Så kjører vi listen med modellen vi har trent og får resultatene

In [32]:
y_pred = best_model.predict(df_Test)

In [33]:
y_pred

array([120716.016, 158398.17 , 191163.56 , ..., 180101.25 , 115614.336,
       233217.98 ], dtype=float32)

Så oppretter vi en csv fil med Id nr og SalePrice siden det er den vi har trent modellen til å retunere. 

In [34]:
pred = pd.DataFrame(y_pred)
sub_df=pd.read_csv('sample_submission.csv')
datasets=pd.concat([sub_df['Id'], pred],axis=1)
datasets.columns=['Id', 'SalePrice']
datasets.to_csv('sample_submission.csv',index=False)


[CV 1/5] END gamma=0.3745401188473625, learning_rate=0.24014286128198326, max_depth=12, min_child_weight=1, n_estimators=1144, subsample=0.5780093202212182;, score=(train=-8.251, test=-17206.340) total time=   3.3s
[CV 3/5] END gamma=0.15599452033620265, learning_rate=0.061616722433639894, max_depth=9, min_child_weight=1, n_estimators=1223, subsample=0.5714334089609704;, score=(train=-0.171, test=-16572.982) total time=   5.0s
[CV 2/5] END gamma=0.6508884729488529, learning_rate=0.061282315805420054, max_depth=9, min_child_weight=2, n_estimators=905, subsample=0.5003893829205072;, score=(train=-81.072, test=-17704.979) total time=   2.4s
[CV 1/5] END gamma=0.9922115592912175, learning_rate=0.17349630192554333, max_depth=11, min_child_weight=2, n_estimators=352, subsample=0.7159725093210578;, score=(train=-9.641, test=-16674.725) total time=   1.3s
[CV 2/5] END gamma=0.9922115592912175, learning_rate=0.17349630192554333, max_depth=11, min_child_weight=2, n_estimators=352, subsample=0.71

[CV 4/5] END gamma=0.3745401188473625, learning_rate=0.24014286128198326, max_depth=12, min_child_weight=1, n_estimators=1144, subsample=0.5780093202212182;, score=(train=-8.526, test=-16508.045) total time=   3.0s
[CV 1/5] END gamma=0.15599452033620265, learning_rate=0.061616722433639894, max_depth=9, min_child_weight=1, n_estimators=1223, subsample=0.5714334089609704;, score=(train=-8.041, test=-16033.350) total time=   4.7s
[CV 1/5] END gamma=0.6508884729488529, learning_rate=0.061282315805420054, max_depth=9, min_child_weight=2, n_estimators=905, subsample=0.5003893829205072;, score=(train=-59.775, test=-15802.380) total time=   2.5s
[CV 4/5] END gamma=0.6508884729488529, learning_rate=0.061282315805420054, max_depth=9, min_child_weight=2, n_estimators=905, subsample=0.5003893829205072;, score=(train=-81.554, test=-13678.064) total time=   2.4s
[CV 4/5] END gamma=0.9922115592912175, learning_rate=0.17349630192554333, max_depth=11, min_child_weight=2, n_estimators=352, subsample=0.7

[CV 2/5] END gamma=0.3745401188473625, learning_rate=0.24014286128198326, max_depth=12, min_child_weight=1, n_estimators=1144, subsample=0.5780093202212182;, score=(train=-8.496, test=-18184.567) total time=   3.1s
[CV 2/5] END gamma=0.15599452033620265, learning_rate=0.061616722433639894, max_depth=9, min_child_weight=1, n_estimators=1223, subsample=0.5714334089609704;, score=(train=-8.764, test=-17313.866) total time=   4.6s
[CV 5/5] END gamma=0.15599452033620265, learning_rate=0.061616722433639894, max_depth=9, min_child_weight=1, n_estimators=1223, subsample=0.5714334089609704;, score=(train=-0.161, test=-16460.813) total time=   4.8s
[CV 3/5] END gamma=0.9922115592912175, learning_rate=0.17349630192554333, max_depth=11, min_child_weight=2, n_estimators=352, subsample=0.7159725093210578;, score=(train=-1.640, test=-18242.181) total time=   1.3s
[CV 2/5] END gamma=0.2912291401980419, learning_rate=0.17237057894447588, max_depth=11, min_child_weight=3, n_estimators=289, subsample=0.5

In [57]:
import gradio as gr

def create_dummies(df, columns):
    for column in columns:
        if column in df.columns:
            dummies = pd.get_dummies(df[column], prefix=column, drop_first=True)
            df = pd.concat([df, dummies], axis=1)
            df.drop([column], axis=1, inplace=True)
    return df

# Funksjon for å gjøre prediksjoner
def predict_price(input_file):
    # Les inn data fra CSV-filen
    input_df = pd.read_csv(input_file.name)  # Tilgang til filnavnet fra filobjektet
    
    
    # Bruk modellen til å gjøre prediksjon
    prediction = best_model.predict(input_df)
    return prediction[0]

# Opprett Gradio-grensesnittet med en inndatakomponent for filopplasting
iface = gr.Interface(
    fn=predict_price,
    inputs=gr.File(label="Last opp CSV-fil", type="file"),  # Liste over gyldige filtyper
    outputs="text"  # Du kan endre dette basert på prediksjonens output-format
)

# Start Gradio-grensesnittet
iface.launch()

Running on local URL:  http://127.0.0.1:7870

To create a public link, set `share=True` in `launch()`.


