In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the raw dataset from the working directory; the bare name on the last
# line renders the frame as the cell's output.
df = pd.read_csv(
    'Data.csv',
    encoding="utf-8",
)
df

## Content

In [None]:
# Print a concise summary of the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Count the missing entries in each column.
df.isnull().sum()

In [None]:
plot_columns = ['total_value', 'area', 'year', 'deposit', 'rent']

# A single row of panels, one per selected column.
fig, axes = plt.subplots(nrows=1, ncols=len(plot_columns), figsize=(30, 7))

# Pair every panel with its column and draw that column's histogram on it.
for ax, column in zip(axes, plot_columns):
    df[column].plot.hist(ax=ax, title=column)
    ax.set_xlabel(column)

# Tighten the spacing between panels, then render the figure.
fig.tight_layout()
plt.show()

In [None]:
plot_columns = ['elavator', 'parking', 'warehouse']

# Same layout as the previous figure: one panel per listed column, in one row.
fig, axes = plt.subplots(nrows=1, ncols=len(plot_columns), figsize=(20, 8))

# Walk the columns and panels together; each panel is titled and labelled
# with the name of the column it shows.
for column, ax in zip(plot_columns, axes):
    df[column].plot.hist(ax=ax, title=column)
    ax.set_xlabel(column)

# Even out the spacing before displaying.
fig.tight_layout()
plt.show()

In [None]:
# List the distinct values of the 'warehouse' column together with how often each occurs.
np.unique(df['warehouse'] , return_counts = True )

## Preprocessing for model

In [None]:
# Columns handled as categorical: they are pulled out of the frame here and
# one-hot encoded in a later cell.
cat_col = ['elavator', 'parking', 'warehouse', 'neighborhood']

cat_col_df = df.loc[:, cat_col]   # only the categorical columns
df1 = df.drop(cat_col, axis=1)    # everything except the categorical columns
df1

In [None]:
# Expand each categorical column into indicator (dummy) columns.
one_hot_encoded = pd.get_dummies(
    cat_col_df,
    columns=cat_col,
)
one_hot_encoded

In [None]:
# Place the non-categorical columns and the indicator columns side by side,
# aligned on the row index.
df2 = pd.concat((df1, one_hot_encoded), axis='columns')
df2

## Train Test Split

In [None]:
# df2 is already a 2-D table with len(df2) rows, so reshaping it to
# (len(df2), -1) cannot change its shape; np.reshape would only round-trip the
# frame through a NumPy array, which can collapse mixed column dtypes into one.
# The next cell calls df3.drop(columns=...), so df3 has to remain a DataFrame:
# a plain copy keeps the column labels and the per-column dtypes intact.
df3 = df2.copy()
df3

In [None]:
# Target column first, then every remaining column as the feature matrix.
y = df3.loc[:, 'total_value']
X = df3.drop('total_value', axis=1)

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=2,
)

## Modeling

In [None]:
# Carve a validation set out of the training data so that early stopping is
# not driven by the test set; X_test / y_test stay untouched for the final
# evaluation and its score is no longer optimistically biased.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=2
)

reg = xgb.XGBRegressor(
    base_score=0.5,
    booster='gbtree',
    n_estimators=2000,             # upper bound; early stopping decides the actual count
    early_stopping_rounds=50,
    objective='reg:squarederror',  # current name of the squared-error loss ('reg:linear' is the deprecated alias)
    max_depth=3,
    learning_rate=0.01,
)

# XGBoost monitors the LAST entry of eval_set for early stopping, so the
# validation split goes last; the fit split is listed only to log its error.
reg.fit(
    X_fit, y_fit,
    eval_set=[(X_fit, y_fit), (X_val, y_val)],
    verbose=100,
)

## Validation

In [1]:
# Predictions for both splits; the scoring and plotting cells below read these.
y_pre_tr = reg.predict(X_train)   # fitted values on the training rows
y_pre_te = reg.predict(X_test)    # predictions on the held-out rows

NameError: name 'reg' is not defined

In [None]:
# Coefficient of determination on the held-out rows ...
r2_test = r2_score(y_test, y_pre_te)
print('R2 Score for Test data = ' , r2_test)

# ... and on the rows the model was trained on, for comparison.
r2_train = r2_score(y_train, y_pre_tr)
print('R2 Score for Train data = ' , r2_train)

In [None]:
# Compare predictions with the actual targets, one panel per split.
#
# The actual targets are converted with np.asarray so they are drawn by
# position (0..n-1), exactly like the prediction arrays. Passing the pandas
# Series directly would place each point at its (shuffled) index label, so the
# predicted and actual point clouds would not line up on the x axis.
fig_cmp, (ax_train, ax_test) = plt.subplots(2, 1)

ax_train.plot(y_pre_tr, 'b.')
ax_train.plot(np.asarray(y_train), 'r.')
ax_train.legend(['predict', 'actual'])
ax_train.set_title('Accuracy for train data')

# The test panel gets the same legend and a title of its own so it can be read
# without referring to the panel above.
ax_test.plot(y_pre_te, 'b.')
ax_test.plot(np.asarray(y_test), 'r.')
ax_test.legend(['predict', 'actual'])
ax_test.set_title('Accuracy for test data')

# Keep the lower panel's title from colliding with the upper panel's axis.
fig_cmp.tight_layout()
plt.show()