In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'housedata:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F46927%2F85203%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240317%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240317T052339Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Da5e900f37c90217fb66f5d59c82b14930ca1e0ba7ba3bef49ee8fefc854b435d5c4438e1c356c148a3877492872582632d14d417109e33138ef8d2bf539085bbffa56578a2ad6e563ef0a519ff83cef36f26002ee4742a74f91ef83dc31fb1068d688e0cae6b0f853fb8a005bdcae2836436a02d4c8a6f9974d4a8c79ec174e9d04bb08aaabb9f336fa4270b584ac0bf8461b405afc24dcb888af2ae2125ae7f91b70fa4bcc36f8620f09cb363cf94f62095f90cae334527801f6bd41ad4e8fe719dac4d8118a291109cc7df0ac7bb9770e64735d9a2e78469bf46d7930e450171df5714987e046686db18e869d1942c6c792ca50892b409cc40aca3a6e0b5f4'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every '<directory>:<url-encoded URL>' entry of DATA_SOURCE_MAPPING into
# a temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first colon only, so a literal colon in the URL part cannot
    # break the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header may be missing; in that case the bar stays empty and only
            # the byte counter advances.
            expected_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / expected_bytes)) if expected_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Make sure every buffered byte is on disk and rewind before reading
            # the archive back.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name for the archive handle: binding it to
                # `tarfile` would shadow the module for every later iteration.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Signed download URLs expire, so an HTTP error most likely means that.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [6]:
# Prefer the copy that the setup cell unpacks under /kaggle/input/housedata and
# fall back to the Colab upload location this notebook used originally.
# NOTE(review): assumes the archive contains data.csv at its top level - confirm.
data_path = os.path.join('/kaggle/input', 'housedata', 'data.csv')
if not os.path.exists(data_path):
    data_path = '/content/data.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float

In [None]:
df.describe()

In [None]:
df.head()

In [None]:
for each in df.columns:
    print(df[each].dtype)


In [None]:
import seaborn as sns

In [None]:
df.columns.to_list()

In [None]:
dataprice = df[df['price'] > 10000000]
dataprice['price']

In [None]:
df[df['price']<1965221]['price'].value_counts()

In [None]:
sns.histplot(data = df[df['price']<1965221]['price'], kde = True)

In [None]:
df1 = df[df['price']<1965221]

In [None]:
df2 = df1.dropna(subset=['price'])

In [None]:
#sns.histplot( x = df2.bedrooms.value_counts().index,y =df2.bedrooms.value_counts().values ,bins = 9)
sns.barplot( x = df2.bedrooms.value_counts().index,y =df2.bedrooms.value_counts().values )
df2.bedrooms.value_counts()

In [None]:
# Fill missing bedroom counts with the column median, then display the median
# of the filled column.
bedrooms_median = df2['bedrooms'].median()
df2['bedrooms'] = df2['bedrooms'].fillna(bedrooms_median)
df2['bedrooms'].median()

In [None]:
df2.info()

In [None]:
cat_cols = [col for col in df2.columns if df2[col].dtypes == "O"]
cat_cols

# # lets check how many unique values in those

In [None]:
for each in cat_cols:
    print(len(df2[each].unique()),each)

# lets see how many cols are seem numerical but in fact cat

In [None]:
#tresh
potnumericcol = [col for col in df2.columns if col not in cat_cols]
potnumericcol

# lets see how many cols are seem numerical but in fact cat

In [None]:
treshold = 6
numbutcat = [col for col in potnumericcol if len(df2[col].unique()) < treshold]
numbutcat

In [None]:
for each in numbutcat:
    print(df2[each].unique(), each )

# Now let's see how they affect price, based on their mean prices on a bar plot

In [None]:
# Hand seaborn the frame and the column names and let it aggregate: barplot then
# draws the mean price per waterfront value (mean is its default estimator).
# Passing the per-row 'waterfront' column as x together with a per-group mean as y
# cannot work, because the two have different lengths.
sns.barplot(data=df2, x='waterfront', y='price')
df2.loc[df2['waterfront'] == 1, 'price'].mean()

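# Note: a bar plot cannot take the per-row `waterfront` column as x together with the per-group mean price as y, because the two have different lengths. Either aggregate first (as in the next cells) or let seaborn do the aggregation.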

In [None]:
print(df2[df2['waterfront'] == 1]['price'].head(100))

In [None]:
df2.groupby('waterfront')['price'].mean().plot(kind="bar", stacked=True)


Encoding for waterfront seems good

In [None]:
df2.groupby('view')['price'].mean().plot(kind="bar", stacked=True)

for this I can use k fold target encoding

In [None]:
df2.groupby('condition')['price'].mean().plot(kind="bar", stacked=True)

also good candidate for encoding is k fold target encoding

In [None]:
df.yr_renovated.value_counts()

there are a lot of 0 values it probably means they are not renovated lets check

In [None]:
df2[df2['yr_renovated']==0]['price'].mean()

In [None]:
df2[df2['yr_renovated']!=0]['price'].mean()

In [None]:
df2.head(10)

it seems like they did not need renovation so their price could be a bit higher

In [None]:
meannotrenovated= df2[df2['yr_renovated']==0]['yr_built'].mean()

In [None]:
meanrenovated = df2[df2['yr_renovated']!=0]['yr_built'].mean()

so as expected not renovated ones are overall newer than those which are renovated

Apparently I need to replace the 0 values in the 'yr_renovated' column with the current year plus the difference between those two means, so that I won't get unusual results when I subtract the current year from them.

In [None]:
import datetime

this_year = datetime.datetime.now().year
df2['yr_renovated'] = df2['yr_renovated'].apply(lambda x: this_year+(meannotrenovated-meanrenovated) if x == 0 else x)

lets change this into more simple term so like decreasing from current year

In [None]:
df2['yr_renovated'] = df2['yr_renovated'].apply(lambda x: x-this_year)

In [None]:
df2.groupby('yr_built').size()

In [None]:
df2['yr_built'] = df2['yr_built'].apply(lambda x : this_year-x)

In [None]:
df2.head()

In [None]:
for each in df2.columns:
    print(each, len(df2[each].unique()))

# Now let's figure out what to do with streets: 4425 different names is far too many for one-hot encoding and might be unnecessary for target encoding, so grouping them first and then encoding the groups could be a good idea!

In [None]:
# Price at the 0 %, 20 %, 40 %, 60 %, 80 % and 100 % quantiles; each value is
# printed as it is computed and collected in quantile_list_price.
quantile_list = [0, 0.20, 0.40, 0.60, 0.80, 1]
quantile_list_price = []
for q in quantile_list:
    price_at_q = df2['price'].quantile(q)
    print(price_at_q)
    quantile_list_price.append(price_at_q)


In [None]:
my_dict = dict(zip(quantile_list, quantile_list_price))

In [None]:
my_dict

In [None]:
for key, values in my_dict.items():
    print(key,values)

In [None]:
import numpy as np

# Put every row into a price band and store the band's label in 'street_labels'.
# The band edges are taken from `quantile_list_price` (computed in the quantile
# cell above) instead of hand-copied numbers, so they follow the data when the
# price filter changes.
# NOTE(review): the previously hard-coded edges (7800, 300000, 405020, 530000,
# 702700) look like exactly these quantiles - confirm.
values = [0.1, 0.2, 0.3, 0.4, 0.6, 1.0]
band_edges = quantile_list_price[:-1]  # lower edge of each band; the maximum is not needed

# below the lowest edge | between consecutive edges | at or above the highest edge
conditions = [df2['price'] < band_edges[0]]
conditions += [
    (lower <= df2['price']) & (df2['price'] < upper)
    for lower, upper in zip(band_edges[:-1], band_edges[1:])
]
conditions.append(df2['price'] >= band_edges[-1])

# Rows that match no condition (e.g. a missing price) get NaN.
df2['street_labels'] = np.select(conditions, values, default=np.nan)

In [None]:
#df2['street_labels'] = [f"Q{i}" for i in pd.qcut(df2['price'], quantile_list, labels=False, duplicates='drop')]

In [None]:
common_streetnames = df2.groupby('street_labels')['street'].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else None)

In [None]:
common_streetnames

In [None]:
filtered_df = df2[(df2['street'] == '1610 N 185th St')]
filtered_df

In [None]:
df2['street'].value_counts()

In [None]:
street_dict = {'0.1': [], '0.2': [], '0.3': [], '0.4': [], '0.6': [], '1.0': []}

# Label of the first row seen for each street, built once. Masking the whole frame
# for every single row made this cell quadratic in the number of rows.
first_label_by_street = (
    df2.drop_duplicates(subset='street')
       .set_index('street')['street_labels']
       .to_dict()
)

# Every row's street is appended (duplicates included) to the bucket of the label
# carried by that street's first row; labels without a bucket are skipped.
for street in df2['street'].to_list():
    if pd.notna(street):
        key = str(first_label_by_street[street])
        if key in street_dict:
            street_dict[key].append(street)



In [None]:
# Toy example: find every key whose list contains a given value.
my_dict = {'key1': [1, 2, 3], 'key2': [4, 5, 6], 'key3': [7, 8, 9]}

search_value = 5

# Keys are collected in dictionary order.
found_keys = [key for key, value_list in my_dict.items() if search_value in value_list]

# Display the keys where the value was found
print(f"Search value {search_value} found in the following keys: {found_keys}")

In [None]:
df2.loc[df2['street_labels']==0.2,'street_labels']

# now lets engage with cities

In [None]:
stdvalforcity = df2.groupby('city')['price'].std()
df2.groupby('city')['price'].mean().sort_values().plot(kind='bar',yerr=stdvalforcity, capsize=5, stacked=True,figsize=(30, 15))


In [None]:
df2[df2['city']=='Inglewood-Finn Hill']

In [None]:
cities = df2.groupby('city')['city'].unique()
citiesl = []
for each in cities:
    citiesl.append(each)


In [None]:
# Per-city mean and standard deviation of price, computed on the same filtered
# frame (df2) that the features are merged into. Using the unfiltered `df` mixed
# the removed high-price rows into the statistics and could yield a different set
# of cities than the one derived from df2.
# reset_index() is needed: it turns the 'city' group key back into a regular
# string column, which is what the later merge joins on.
city_stats = df2.groupby('city')['price'].agg(['mean', 'std']).reset_index()


In [None]:
city_stats['city'] = city_stats['city'].astype(str)

In [None]:
city_stats['city'] = city_stats['city'].apply(lambda x: x.strip('[]').replace("'", '') )

In [None]:
df2 = pd.merge(df2, city_stats, on ='city', how ='left',suffixes=('', '_stats'))

In [None]:
#scaling_factor = 0.01  # Adjust as needed
#scaled_lower_bound = df2['mean'] - df2['std'] * scaling_factor
#scaled_upper_bound = df2['mean'] + df2['std'] * scaling_factor

# Draw one noisy copy of the city mean per row from N(mean, std). A seeded generator
# keeps the feature identical across runs.
# 'std' is NaN for a city with a single sale, so those rows get NaN here as well.
city_rng = np.random.default_rng(42)
df2['fluctuatingmeanbycity'] = city_rng.normal(loc=df2['mean'], scale=df2['std'], size=len(df2))

In [None]:
df2 = df2.rename(columns={'mean': 'meanbycity', 'std': 'stdbycity'})

# City also kind of dealt with now I just need to remove other columns

In [None]:
df2['statezip'].value_counts()


In [None]:
stdvalforcity = df2.groupby('statezip')['price'].std()
df2.groupby('statezip')['price'].mean().sort_values().plot(kind='bar',yerr=stdvalforcity, capsize=5, stacked=True,figsize=(30, 15))
#plt.xlabel('Statezip')
#plt.ylabel('Mean Value')
#plt.title('Mean Values by Statezip')

In [None]:
# Per-zip mean and standard deviation of price, computed on the filtered frame
# (df2) so they describe the same rows they are merged into. The unfiltered `df`
# could contain zip codes that no longer exist in df2, which breaks the
# 'statezip_names' assignment below with a length mismatch.
Statezips = df2.groupby('statezip')['statezip'].unique()
Statezipsl = list(Statezips)
# reset_index() keeps 'statezip' as a regular column for the merge.
Statezip_stats = df2.groupby('statezip')['price'].agg(['mean', 'std']).reset_index()
Statezip_stats['statezip_names'] = Statezipsl
df2 = pd.merge(df2, Statezip_stats, on='statezip', how='left', suffixes=('', '_stats'))

# Seeded so the noisy feature is reproducible; the seed differs from the one used
# for the city feature so the two noise columns are not driven by the same draws.
zip_rng = np.random.default_rng(43)
df2['fluctuatingmeanbystatezip'] = zip_rng.normal(loc=df2['mean'], scale=df2['std'], size=len(df2))

In [None]:
df2 = df2.rename(columns={'mean': 'meanbyzipcode', 'std': 'stdzipcode'})

# Don't forget to write a function to encode new data as test data!!!!!

In [None]:
df2['date'].value_counts()

In [None]:
#for each in cat_cols:
#for each in df2.columns:
#    print(df2[each].value_counts())
#    print(df2[each].nunique(),each)
#    print('this is value_counts for ', '!!!!', each)

In [None]:
df2.condition.value_counts()

In [None]:
# country is useless
# Reassign instead of dropping in place and tolerate an already-missing column,
# so that re-running this cell does not raise a KeyError.
df2 = df2.drop(columns=['country'], errors='ignore')

In [None]:
df2.shape


In [None]:
df2['date'].tail(100)

In [None]:
#df2.loc[0,'date'].day_name()
# Spell the time part out explicitly: '%X' means "the locale's time representation"
# and therefore depends on the machine the notebook runs on, whereas the data is
# always written as 'YYYY-MM-DD HH:MM:SS'.
df2['date'] = pd.to_datetime(df2['date'], format='%Y-%m-%d %H:%M:%S')

In [None]:
#df2.loc[0,'date'].day_name()
df2['Dayofweek']= df2['date'].dt.day_name()

In [None]:
df2['date'].max()-df2['date'].min()

In [None]:
#df2.set_index('date')

In [None]:
df2.groupby('Dayofweek')['price'].mean().plot(kind="bar", stacked=True)


In [None]:
#sumbydays = df2.groupby('Dayofweek')['price'].sum()
sumbydays = df2.groupby('Dayofweek')['price'].mean()
print(sumbydays)


In [None]:
df2.groupby('Dayofweek').size()

In [None]:
from scipy.stats import norm
from scipy.stats import chi2_contingency
from statsmodels.stats.proportion import proportions_ztest
import matplotlib.pyplot as plt

In [None]:

# Select the Monday-Friday sales once and derive all three statistics from them.
weekday_names = ['Friday', 'Monday', 'Thursday', 'Tuesday', 'Wednesday']
weekday_rows = df2.loc[df2['Dayofweek'].isin(weekday_names)]

weekdaymean = weekday_rows['price'].mean()   # mean sale price on weekdays
weekdaylen = len(weekday_rows)               # number of weekday sales
weekdaystd = weekday_rows['price'].std()     # sample standard deviation of those prices

weekdaylen

In [None]:
weekdaymean

In [None]:
#'Saturday'    'Sunday'
weekendmean = df2.loc[df2['Dayofweek'].isin(['Saturday','Sunday'])]['price'].mean()
weekendstd = df2.loc[df2['Dayofweek'].isin(['Saturday','Sunday'])]['price'].std()
weekendmean/ weekendstd

In [None]:
np.sqrt(np.square(weekendstd)+np.square(weekdaystd))

In [None]:
# Two-sample z statistic for the difference in mean price, weekend vs weekdays.
# The standard error of a difference of two means is sqrt(s1^2/n1 + s2^2/n2); the
# sample sizes must be part of it, otherwise the statistic is far too small.
weekendlen = len(df2.loc[df2['Dayofweek'].isin(['Saturday','Sunday'])])
Zscore = (weekendmean - weekdaymean) / np.sqrt(
    np.square(weekendstd) / weekendlen + np.square(weekdaystd) / weekdaylen
)
Zscore

# Same comparison for two single days: Saturday vs Wednesday.
saturdayprices = df2.loc[df2['Dayofweek']=='Saturday','price']
wednesdayprices = df2.loc[df2['Dayofweek']=='Wednesday','price']
wednesdaymean = wednesdayprices.mean()
saturdaymean = saturdayprices.mean()

# proportions_ztest compares success counts out of nobs trials, so it cannot be fed
# mean prices. Compute the z statistic for a difference of means directly and take
# the two-sided p-value from the standard normal distribution.
two_samp_zstat = (saturdaymean - wednesdaymean) / np.sqrt(
    saturdayprices.var() / len(saturdayprices) + wednesdayprices.var() / len(wednesdayprices)
)
two_samp_pval = 2 * norm.sf(abs(two_samp_zstat))
two_samp_zstat,two_samp_pval

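Note: `proportions_ztest` is the wrong tool for comparing mean prices. It expects counts of successes out of `nobs` trials, not sample means; a two-sample z- or t-test on the prices themselves is what is needed here.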

> Now I will use a bootstrap (resampling with replacement) approach to see whether there is any significant difference in price between weekdays and weekends

In [None]:
# for sampling
df2day = df2.loc[df2['Dayofweek'].isin(['Friday' ,'Monday','Thursday'
                                                ,'Tuesday','Wednesday'] )]
df2end =df2.loc[df2['Dayofweek'].isin(['Saturday','Sunday'])]

In [None]:
sample_mean_weekday = df2day.sample(frac =1 , replace=True)['price'].mean()
sample_mean_weekday

In [None]:
# Bootstrap: resample each group with replacement `iteration` times and record the
# mean price of every resample.
sample_mean_weekendList = []
sample_mean_weekdayList = []
iteration = 1000
for each in range(iteration):
    # A fixed, distinct seed per draw keeps the whole experiment (and the
    # conclusion drawn from it below) reproducible across runs.
    sample_mean_weekend = df2end.sample(frac=1, replace=True, random_state=each)['price'].mean()
    sample_mean_weekendList.append(sample_mean_weekend)

    sample_mean_weekday = df2day.sample(frac=1, replace=True, random_state=iteration + each)['price'].mean()
    sample_mean_weekdayList.append(sample_mean_weekday)

sample_mean_weekenddf = pd.DataFrame(sample_mean_weekendList)
sample_mean_weekdaydf = pd.DataFrame(sample_mean_weekdayList)

In [None]:
sample_mean_weekdaydf.head()

In [None]:
sample_mean_weekenddf.head()

In [None]:
result = pd.concat([sample_mean_weekenddf,sample_mean_weekdaydf],axis = 1)
result.head()
result.columns = ["weekend", "weekdays"]
result.head()

In [None]:

result['diff'] = (result['weekend']-result['weekdays'])/result['weekend']*100
result['diff'].plot.kde()


In [None]:
prob = (result['diff']>0).sum()/len(result)
prob

# So there is less than a 5 percent chance that there is no difference between weekdays and weekends. Adding a new column that flags weekend vs. weekday therefore makes sense.

In [None]:
df2.head(10)

In [None]:
weekdays = ['Friday' ,'Monday','Thursday','Tuesday','Wednesday']
weekend = ['Saturday','Sunday']
def weekendornot(x):
    """Return 1 when `x` is one of the names in `weekdays`, otherwise 1.5.

    Anything that is not a listed weekday name (Saturday, Sunday, but also any
    unexpected value) falls into the 1.5 branch.
    """
    return 1 if x in weekdays else 1.5

df2['weekdayorweekend']=df2['Dayofweek'].apply(weekendornot)

In [None]:
df2['month'] = df2['date'].dt.month.astype(float)

In [None]:


stdval = df2.groupby('month')['price'].std()
ax = df2.groupby('month')['price'].mean().plot(kind='bar', yerr=stdval, capsize=5, stacked=True, color='skyblue', alpha=0.7)
plt.xlabel('Month')
plt.ylabel('Values')
plt.title('Mean and Standard Deviation Bar Chart')


In [None]:
df2.shape

In [None]:
grouped = df2.groupby('month')['price'].agg(['mean', 'std','count'])#.plot(kind = 'bar')
grouped['scaledcount'] = grouped['count']*150
grouped.plot(kind='bar', y=['mean', 'std', 'scaledcount'], figsize=(10, 6), rot=0)

In [None]:
counts_by_month = df2.groupby('month').count()
counts_by_month

In [None]:
sumofpricesbymonth = df2.groupby('month')['price'].sum()
sumofpricesbymonth

So even though the price went a little higher, the number of houses sold decreased significantly. This can be for many different reasons:
1. prices went higher, so people could not afford to buy
2. it was out of season for buying houses, so instead of waiting to sell, people increased the price to match next year's price?
3. lack of houses
4. lack of people

It seems more data is necessary to draw any conclusion from this, so I will ignore how many houses are being sold in the modelling part.

# now lets remove encoded columns and encode other categorical columns!!!

In [None]:
columns_to_remove = ['street', 'city', 'statezip', 'statezip_names', 'Dayofweek']
df2 = df2.drop(columns=columns_to_remove)

In [None]:
df2

In [None]:
df2['waterfront'].value_counts()

# Waterfront is okay

# Time to engage with view

In [None]:
# NOTE: the converted Series is only displayed here; it is not assigned back, so
# df2['view'] keeps its original dtype.
df2['view'].astype('str')

In [None]:
df2['view'].value_counts()

In [None]:
my_dict = {}
variable_to_encode = df2['view'].unique().tolist()

# Assign values in a for loop
my_dict = {key: [] for key in variable_to_encode}
for key in variable_to_encode:
    for i in range(len(variable_to_encode)):  # For example, adding three items to each list
        my_dict[key].append(f"{key}")

# Display the resulting dictionary
print(my_dict)

In [None]:
#variable_to_encode = df2['view'].unique().tolist()
#my_dict = {key: [] for key in variable_to_encode}
#all_encoded_val = pd.Series(all_encoded_values).unique().tolist()
#
#for key in my_dict.items:
#    for i in range(len(encoded_values_fold)):  # For example, adding three items to each list
#        my_dict[key].append(f"{key}")
#a = pd.Series(all_encoded_values).unique().tolist()
#
#
#
## Use a list comprehension to create a list with 5 elements, each containing 5 elements
#result_list = [a[i:i+len(variable_to_encode)] for i in range(0, len(a), len(variable_to_encode))]
#
## Display the resulting list
#print(result_list)#

In [None]:
variable_to_encode = df2['view'].unique().tolist()
type(len(variable_to_encode))

# ENCODING BLOCK

In [None]:
import random
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, random_state=123, shuffle=True)

variable_to_encode = df2['view'].unique().tolist()                            ### view will be changed

# Define your mean_target_encoding function
def mean_target_encoding(train, test, target, categorical, alpha=5):
    """Add a smoothed mean-target encoding of `categorical` to `train` and `test`.

    For each category seen in `train` the encoded value is

        (sum of target in the category + global_mean * alpha) / (category size + alpha)

    so small categories are pulled towards the global mean of `train`. The
    smoothing uses the size of each category; using the total number of rows
    would apply the same, practically negligible, shrinkage to every category.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames holding `categorical`; `train` must also hold `target`.
        Both are modified in place: the column ``<categorical>_mean_encoded``
        is added (or overwritten).
    target : str
        Name of the target column.
    categorical : str
        Name of the column to encode.
    alpha : float, default 5
        Smoothing strength; 0 gives the plain per-category mean.

    Returns
    -------
    tuple of (train, test)
        The same two frames that were passed in.
    """
    encoded_col = categorical + '_mean_encoded'

    # All statistics come from the training frame only.
    global_mean = train[target].mean()
    category_stats = train.groupby(categorical)[target].agg(['sum', 'count'])
    smoothed = (category_stats['sum'] + global_mean * alpha) / (category_stats['count'] + alpha)

    train[encoded_col] = train[categorical].map(smoothed)
    # Categories that never occur in `train` fall back to the global mean
    # instead of becoming NaN.
    test[encoded_col] = test[categorical].map(smoothed).fillna(global_mean)

    return train, test


all_encoded_values = []
resulted_list = []  # one list per fold, ordered like variable_to_encode
# For each fold split
for train_index, test_index in kf.split(df2):
    cv_train, cv_test = df2.iloc[train_index].copy(), df2.iloc[test_index].copy()

    # 'view' is the categorical column being encoded
    cv_train, cv_test = mean_target_encoding(train=cv_train,
                                             test=cv_test,
                                             target='price',
                                             categorical='view',
                                             alpha=5)

    # Look the encoded value up per category. Relying on the order in which
    # .unique() returns the encoded values is not safe: that order follows the
    # row order of this fold, not the order of variable_to_encode, and a category
    # may be missing from a fold altogether.
    fold_encoding = (
        cv_train.drop_duplicates(subset='view')
                .set_index('view')['view_mean_encoded']
                .to_dict()
    )
    fold_fallback = cv_train['price'].mean()  # used for a category absent from this fold
    encoded_values_fold = [fold_encoding.get(category, fold_fallback)
                           for category in variable_to_encode]

    # Add the values to the containers
    resulted_list.append(encoded_values_fold)
    all_encoded_values.extend(encoded_values_fold)

# Display the unique values collected
print("Unique Encoded Values:", pd.Series(all_encoded_values).unique())
Encoded_values_list = pd.Series(all_encoded_values).unique().tolist()

# category -> list of its encoded values, one per fold
result_dict = {key: [values[i] for values in resulted_list] for i, key in enumerate(variable_to_encode)}

# Every row gets one of its category's per-fold encodings at random; the seed makes
# the assignment reproducible. cv_train / cv_test are the frames of the last fold,
# which together cover every row of df2.
random.seed(123)
cv_train['view_mean_encoded'] = cv_train['view'].apply(lambda x: random.choice(result_dict[x]))
cv_test['view_mean_encoded'] = cv_test['view'].apply(lambda x: random.choice(result_dict[x]))

In [None]:
df_combined = pd.concat([cv_train, cv_test])
df_combined

In [None]:
kf = KFold(n_splits=5, random_state=123, shuffle=True)

variable_to_encode = df2['condition'].unique().tolist()                            ### view will be changed

# Define your mean_target_encoding function
def mean_target_encoding(train, test, target, categorical, alpha=5):
    """Add a smoothed mean-target encoding of `categorical` to `train` and `test`.

    NOTE: this function is also defined in the previous encoding cell; keep the
    two definitions identical or delete one of them.

    For each category seen in `train` the encoded value is

        (sum of target in the category + global_mean * alpha) / (category size + alpha)

    so small categories are pulled towards the global mean of `train`. The
    smoothing uses the size of each category; using the total number of rows
    would apply the same, practically negligible, shrinkage to every category.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames holding `categorical`; `train` must also hold `target`.
        Both are modified in place: the column ``<categorical>_mean_encoded``
        is added (or overwritten).
    target : str
        Name of the target column.
    categorical : str
        Name of the column to encode.
    alpha : float, default 5
        Smoothing strength; 0 gives the plain per-category mean.

    Returns
    -------
    tuple of (train, test)
        The same two frames that were passed in.
    """
    encoded_col = categorical + '_mean_encoded'

    # All statistics come from the training frame only.
    global_mean = train[target].mean()
    category_stats = train.groupby(categorical)[target].agg(['sum', 'count'])
    smoothed = (category_stats['sum'] + global_mean * alpha) / (category_stats['count'] + alpha)

    train[encoded_col] = train[categorical].map(smoothed)
    # Categories that never occur in `train` fall back to the global mean
    # instead of becoming NaN.
    test[encoded_col] = test[categorical].map(smoothed).fillna(global_mean)

    return train, test


all_encoded_values = []
resulted_list = []  # one list per fold, ordered like variable_to_encode
# For each fold split
for train_index2, test_index2 in kf.split(df_combined):
    # Index the frame that is being split with the indices of THIS loop. The stale
    # train_index / test_index left over from the 'view' cell made every iteration
    # work on the same single split of df2.
    cv_train1, cv_test1 = df_combined.iloc[train_index2].copy(), df_combined.iloc[test_index2].copy()

    cv_train1, cv_test1 = mean_target_encoding(train=cv_train1,
                                               test=cv_test1,
                                               target='price',
                                               categorical='condition',
                                               alpha=5)

    # Look the encoded value up per category rather than relying on the order in
    # which .unique() returns the values (it follows the row order of the fold and
    # a category can be missing from a fold).
    fold_encoding = (
        cv_train1.drop_duplicates(subset='condition')
                 .set_index('condition')['condition_mean_encoded']
                 .to_dict()
    )
    fold_fallback = cv_train1['price'].mean()  # used for a category absent from this fold
    encoded_values_fold = [fold_encoding.get(category, fold_fallback)
                           for category in variable_to_encode]

    # Add the values to the containers
    resulted_list.append(encoded_values_fold)
    all_encoded_values.extend(encoded_values_fold)

# Display the unique values collected
print("Unique Encoded Values:", pd.Series(all_encoded_values).unique())
Encoded_values_list = pd.Series(all_encoded_values).unique().tolist()

# category -> list of its encoded values, one per fold
result_dict1 = {key: [values[i] for values in resulted_list] for i, key in enumerate(variable_to_encode)}

# Every row gets one of its category's per-fold encodings at random; the seed makes
# the assignment reproducible. cv_train1 / cv_test1 are the frames of the last fold,
# which together cover every row of df_combined.
random.seed(124)
cv_train1['condition_mean_encoded'] = cv_train1['condition'].apply(lambda x: random.choice(result_dict1[x]))
cv_test1['condition_mean_encoded'] = cv_test1['condition'].apply(lambda x: random.choice(result_dict1[x]))

In [None]:
df_combined

In [None]:
df_combined2 = pd.concat([cv_train1, cv_test1])

In [None]:
df_combined2

In [None]:
df_combined2['view_mean_encoded']=df_combined['view_mean_encoded']

In [None]:
df_combined2.dtypes

'maybe in the future I will encode month too'

# below is testing place for big code block

In [None]:
resulted_list
result_dict = {key: [values[i] for values in resulted_list] for i, key in enumerate(variable_to_encode)}
result_dict

In [None]:
#import random
#cv_train['view_mean_encoded'] = cv_train['view'].apply(lambda x:random.choice(result_dict[x]))

exercise to divide list into sublists

In [None]:
cv_train.head(15)

In [None]:
#a = pd.Series(all_encoded_values).unique().tolist()



# Use a list comprehension to create a list with 5 elements, each containing 5 elements
#result_list = [a[i:i+5] for i in range(0, len(a), 5)]

# Display the resulting list
#print(result_list)

In [None]:

df02 = pd.DataFrame({'A': np.arange(11, 21), 'B': list('k0k0k0k0kk')})


df02

In [None]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5,  shuffle=True)
for train_index, test_index in kf.split(df02):
    print(train_index, test_index)
    print(df02.shape, 'this is cv shape')


# Testing codes are over here

In [None]:
df_combined2['street_labels']

# I will divide data into train test stratified

In [None]:
from sklearn.model_selection import train_test_split
# Features: every column except the target ('price') and the raw timestamp ('date').
# NOTE(review): 'street_labels' is derived directly from 'price' (see the np.select
# cell) and stays in X, so the models are given a binned copy of the target as a
# feature. Confirm whether it should only be used for stratification.
X = df_combined2.drop(columns=['price','date'])
y = df_combined2['price']
# Stratify on the price band so train and test have the same band proportions.
stratifying= df_combined2['street_labels']
# Assuming X and y are your features and target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=stratifying)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Assuming you have your training and testing data
# X_train, X_test, y_train, y_test

# Step 2: Create an instance of MinMaxScaler
scaler = MinMaxScaler()

# Step 3: Fit and transform the training data
X_train_scaled = scaler.fit_transform(X_train)

# Step 4: Transform the testing data using the same scaler
X_test_scaled = scaler.transform(X_test)

In [None]:
X_train['street_labels'].value_counts()/len(X_train)

In [None]:
X_test['street_labels'].value_counts()/len(X_test)

In [None]:
X_test.isna().sum()

In [None]:
X_train_nadropped = X_train.dropna()

In [None]:
X_test_nadropped = X_test.dropna()

In [None]:
dropped_indicestrain = X_train.index.difference(X_train_nadropped.index)

In [None]:
dropped_indicestest = X_test.index.difference(X_test_nadropped.index)

In [None]:
dropped_indicestrain

In [None]:
dropped_indicestest

In [None]:
X_train_nadropped.isna().sum()

# Modelling

random forrest

In [None]:
# The target rows matching the feature rows dropped above must be removed as well, because the random forest does not accept NaN values.

In [None]:
y_train_dropped = y_train.drop(dropped_indicestrain)
y_test_dropped = y_test.drop(dropped_indicestest)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Hyperparameters are the best combination reported by the (commented-out) grid
# search below.
rf = RandomForestRegressor(
    bootstrap=True,
    max_depth=None,
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3. For regressors
    # it meant "consider all features at every split", which 1.0 states explicitly.
    max_features=1.0,
    min_samples_leaf=4,
    min_samples_split=2,
    n_estimators=200,
    random_state=42
)
# Fit on the NaN-free rows only.
rf.fit(X=X_train_nadropped, y=y_train_dropped)
#from sklearn.model_selection import GridSearchCV
#
##X_train, X_test, y_train, y_test
## Example grid of hyperparameters for grid search
#param_grid = {
#    'n_estimators': [50, 100, 200],
#    'max_depth': [None, 10, 20],
#    'min_samples_split': [2, 5, 10],
#    'min_samples_leaf': [1, 2, 4],
#    'max_features': ['auto', 'sqrt', 'log2'],
#    'bootstrap': [True, False],
#    'random_state': [42],
#}
#
## Create a Random Forest Regressor
#rf = RandomForestRegressor()
#
## Use GridSearchCV to find the optimal hyperparameters
#grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
#grid_search.fit(X_train_nadropped, y_train_dropped)
#
## Print the best hyperparameters
#best_params = grid_search.best_params_
#print("Best Hyperparameters:", best_params)
#Best Hyperparameters: {'bootstrap': True, 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200, 'random_state': 42}

In [None]:
rftrainpredictions = rf.predict(X_train_nadropped)

In [None]:
rfpredictions = rf.predict(X_test_nadropped)


In [None]:
from sklearn.metrics import mean_squared_error
mse_train = mean_squared_error(y_train_dropped, rftrainpredictions)
mse_test = mean_squared_error(y_test_dropped, rfpredictions)

print('MSE Train: {:.3e}. MSE Test: {:.3e}'.format(mse_train, mse_test))

xgboost

In [None]:
#y_train_dropped = y_train.drop(dropped_indicestrain)
#y_test_dropped = y_test.drop(dropped_indicestest) X_test_nadropped X_train_nadropped
import xgboost as xgb
# Define xgboost parameters
# 'reg:squarederror' is the current name of the objective formerly called
# 'reg:linear' (deprecated).
params = {'objective': 'reg:squarederror',
          'max_depth': 2,
          'verbosity': 0}
# Train on the same rows as the other boosters (X_train / y_train): the evaluation
# cell scores every model on X_train and X_test, and DMatrix treats NaN as missing,
# so the NaN-dropped frames are not needed here.
dtrain = xgb.DMatrix(data=X_train,
                     label=y_train)
xg_depth_2 = xgb.train(params=params, dtrain=dtrain)

In [None]:
# Define xgboost parameters
# 'reg:squarederror' is the current name of the objective formerly called
# 'reg:linear' (deprecated).
params = {'objective': 'reg:squarederror',
          'max_depth': 10,
          'verbosity': 0}
dtrain = xgb.DMatrix(data=X_train,
                     label=y_train)
# Number of boosting rounds is left at the xgb.train default.
xg_depth_10 = xgb.train(params=params, dtrain=dtrain)

In [None]:
# Define xgboost parameters
# 'reg:squarederror' is the current name of the objective formerly called
# 'reg:linear' (deprecated).
params = {'objective': 'reg:squarederror',
          'max_depth': 16,
          'verbosity': 0}
dtrain = xgb.DMatrix(data=X_train,
                     label=y_train)
# Number of boosting rounds is left at the xgb.train default.
xg_depth_16 = xgb.train(params=params, dtrain=dtrain)

In [None]:
params = {
          'max_depth': 5,
          'verbosity': 0,
         'learning_rate': 0.2}
dtrain = xgb.DMatrix(data=X_train,
                     label=y_train)

# xgb.train does not read 'n_estimators' (that is a parameter of the scikit-learn
# wrapper, XGBRegressor). With the native API the number of trees is given as
# num_boost_round, which otherwise defaults to 10.
xg_best = xgb.train(params=params, dtrain=dtrain, num_boost_round=100)


In [None]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Assuming X, y are your features and target variable
param_grid = {
    'learning_rate': [0.1, 0.2, 0.3],
    'max_depth': [3, 4, 5,6,7],
    'n_estimators': [50, 100, 200]
}

xgb_model = xgb.XGBRegressor()

grid_search = GridSearchCV(xgb_model, param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

# Best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Best negative mean squared error
print("Best Negative MSE:", grid_search.best_score_)
#Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 50} Best Negative MSE: -8587613159.06636

# Evaluating results

In [None]:
from sklearn.metrics import mean_squared_error
#y_train, y_test
# Feature-only DMatrix objects used for prediction (labels are not needed here).
dtrain = xgb.DMatrix(data=X_train)
dtest = xgb.DMatrix(data=X_test)

# For each of the 4 trained models
for model in [xg_depth_2, xg_depth_10, xg_depth_16,xg_best]:
    # Make predictions
    train_pred = model.predict(dtrain)
    test_pred = model.predict(dtest)

    # Calculate metrics
    mse_train = mean_squared_error(y_train, train_pred)
    mse_test = mean_squared_error(y_test, test_pred)

    # One line per model, in the order of the list above.
    print('MSE Train: {:.3e}. MSE Test: {:.3e}'.format(mse_train, mse_test))



In [None]:
import matplotlib.pyplot as plt

# Predictions of the xg_best booster on the test set. They are computed explicitly
# here instead of reusing `test_pred`, which is just whatever the last iteration of
# the evaluation loop happened to leave behind.
predictions = xg_best.predict(dtest)

# Blue points lie on the identity line (actual vs actual) and serve as the
# reference; red points show the model's prediction for the same actual value.
plt.scatter(y_test, y_test, c='blue', alpha=0.7, label='Actual')
plt.scatter(y_test, predictions, c='red', alpha=0.7, label='Predicted')

# Adding labels and title
plt.xlabel('Actual Values')
plt.ylabel('Predictions')
plt.title('Predictions vs. Actual Values')

# Adding legend
plt.legend()

# Show the plot
plt.show()

In [None]:
predictions=rfpredictions
plt.scatter(y_test_dropped, y_test_dropped, c='blue', alpha=0.7, label='Actual')
plt.scatter(y_test_dropped, predictions, c='red', alpha=0.7, label='Predicted')

# Adding labels and title
plt.xlabel('Actual Values')
plt.ylabel('Predictions')
plt.title('Predictions vs. Actual Values')

# Adding legend
plt.legend()

# Show the plot
plt.show()