In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

I imported the necessary libraries:
*  Pandas is used for data manipulation
*  NumPy is used for arrays and numerical computation
*  Matplotlib and Seaborn are libraries used for data visualization (histograms, distribution plots, relationship plots)



In [None]:
# Mount Google Drive at /content/drive so the CSV stored there can be read.
# NOTE: google.colab is only available inside Google Colab; this cell fails elsewhere.
from google.colab import drive
drive.mount('/content/drive')

- From the google.colab package I imported the drive module and used it to mount my Google Drive, so that the data file stored there can be read from the notebook

In [None]:
# Load the raw CSV from the mounted Drive into the DataFrame `real`
real = pd.read_csv('/content/drive/My Drive/Real Estate Asset Management Data_Migrated Data (1).csv')

- I named my dataset variable "real" and used the pandas function pd.read_csv to load the data from my Drive into the notebook

**EDA OF REAL ESTATE**

-Cleaning and preparing my data for further analysis

In [None]:
# Print every column name of the raw data on its own line
column_names = real.columns
for column_name in column_names:
  print(column_name)

- Viewed my column names in a vertical order

In [None]:
# Count the missing (null) values in each column
real.isnull().sum()

- Checking if my data had any missing values

In [None]:
# Count the rows that are exact duplicates of an earlier row
real.duplicated().sum()

- Checking if my data had any duplicates

In [None]:
# Summary statistics of the raw data (describe() covers the numeric columns by default)
real.describe()

In [None]:
# Drop duplicated rows into a new frame; `real` itself is left unchanged
new_real = real.drop_duplicates()
print(new_real)

In [None]:
# (rows, columns) after removing duplicates
new_real.shape

In [None]:
# Summary statistics of the raw data again (same call as earlier; presumably
# repeated to compare with the de-duplicated frame — confirm or remove)
real.describe()

In [None]:
# Summary statistics after removing duplicates
new_real.describe()

In [None]:
# Columns that are screened for outliers by clean_outliers()
columns = ['GLA Sq Ft','Gross Value','Occupied SqFt','Vacant Sq Ft']


In [None]:
#create a function to clean outliers
def clean_outliers(column, data=None, threshold=3):
  """Return the rows whose `column` value lies within `threshold` standard
  deviations of the column mean.

  Parameters
  ----------
  column : str
      Name of the numeric column to screen.
  data : pandas.DataFrame, optional
      Frame to filter. Defaults to the notebook-level `new_real`, which is
      what the one-argument call `clean_outliers(column)` has always used.
  threshold : float, optional
      Number of standard deviations allowed on either side of the mean
      (default 3).

  Returns
  -------
  pandas.DataFrame
      The rows of `data` that pass the check; `data` itself is not modified.
      Rows where `column` is NaN fail both comparisons and are dropped.
  """
  if data is None:
    data = new_real  # fall back to the de-duplicated notebook frame
  mean = data[column].mean()
  std = data[column].std()
  lower_limit = mean - threshold * std
  upper_limit = mean + threshold * std
  return data[(data[column] >= lower_limit) & (data[column] <= upper_limit)]



Define a new function with `def` that removes the outliers of a given column

Lower limit = mean - threshold * std; Upper limit = mean + threshold * std

In [None]:
# Keep only the rows that pass the outlier check for EVERY column in `columns`.
# Assigning clean_outliers(column) directly inside the loop overwrites the
# result on each pass, so only the last column's filter would be applied.
keep_rows = pd.Series(True, index=new_real.index)
for column in columns:
  keep_rows &= new_real.index.isin(clean_outliers(column).index)
# .copy() gives an independent frame, so later column assignments on it do not
# trigger pandas' SettingWithCopyWarning
new_clean_data = new_real[keep_rows].copy()


In [None]:
# Print the frame that is left after outlier removal
print(new_clean_data)

In [None]:
# (rows, columns) of the raw data, for comparison with the cleaned frame
real.shape

In [None]:
# (rows, columns) after outlier removal
new_clean_data.shape

In [None]:
# Show the dtype of every column in the raw data
real.dtypes

- Viewing datatypes of each column in order to categorize it

In [None]:
# Split the column names by dtype: 'object' columns are treated as
# categorical, every other dtype as non-categorical
categorical_column = [name for name in real.columns if real[name].dtype == 'object']
non_categorical_column = [name for name in real.columns if real[name].dtype != 'object']

print("Categorical column:", categorical_column)

print("\nnon categorical column:")
print(non_categorical_column)

- Grouping my data into categorical and non categorical data

In [None]:
# Distinct values found in the 'Property Type' column of the raw data
real['Property Type'].unique()

- Used .unique to check the elements under property type

In [None]:
# Number of rows per Property Type in the raw data
sns.histplot(data = real, x = 'Property Type')

In [None]:
# Summary statistics of the cleaned data (after outlier removal)
new_clean_data.describe()

- Viewed the statistical data of the dataset

In [None]:
# Column dtypes and non-null counts of the cleaned data
new_clean_data.info()

-Viewed the datatypes

In [None]:
#Encode the columns
from sklearn.preprocessing import LabelEncoder


- For encoding I imported LabelEncoder, which converts the text values of categorical columns into integer codes so that they can be used by the machine learning models

In [None]:
# Columns whose values are replaced by integer codes
encoded_columns = ['Asset Manager', 'Property Name', 'Property Type', 'Average Occupancy %']

# Work on an explicit copy: new_clean_data was produced by filtering another
# frame, and assigning columns on such a result can raise SettingWithCopyWarning.
new_clean_data = new_clean_data.copy()

# One fitted encoder per column, so codes can be mapped back to the original
# labels later with label_encoders[column].inverse_transform(...).
# A single shared encoder would only remember the last column it was fitted on.
label_encoders = {}
for column in encoded_columns:
  le = LabelEncoder()
  new_clean_data[column] = le.fit_transform(new_clean_data[column])
  label_encoders[column] = le
  print(new_clean_data[column])

In [None]:
# Distinct integer codes that the label encoder assigned to 'Average Occupancy %'
new_clean_data['Average Occupancy %'].unique()

In [None]:
# First rows of the cleaned data, with the encoded columns now shown as integer codes
new_clean_data.head()

- I put the names of the columns to encode into one list and then encoded each of those columns in turn

**EDA VISUALIZATION**

In [None]:
# Pairwise correlation (Pearson, the pandas default) between the four numeric columns.
# NOTE(review): this is computed on the raw `real` frame, not on the cleaned
# `new_clean_data` used by the plots and models that follow — confirm that is intended.
selected_columns = ['GLA Sq Ft', 'Gross Value', 'Occupied SqFt', 'Vacant Sq Ft']
correlation_matrix = real[selected_columns].corr()
print(correlation_matrix)

In [None]:
# Draw the correlation matrix as a heatmap, annotating each cell with its
# coefficient rounded to two decimals
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap of Real Estate')
plt.show()

In [None]:
# Bar plot of GLA Sq Ft per Property Type (seaborn's barplot shows the mean per
# category by default). Property Type holds label-encoded integers at this point.
sns.barplot(data = new_clean_data, x = 'Property Type', y = 'GLA Sq Ft',)

In [None]:
# Distribution (histogram) of the single column 'Gross Value' in the cleaned data
sns.displot(new_clean_data, x='Gross Value')

In [None]:
# Scatter plot of Vacant Sq Ft against GLA Sq Ft (cleaned data)
sns.relplot(data = new_clean_data, x ='GLA Sq Ft' , y = 'Vacant Sq Ft')

In [None]:
# Distinct Property Type values in the cleaned data (integer codes after label encoding)
new_clean_data['Property Type'].unique()

In [None]:
# Line plot of Occupied SqFt against GLA Sq Ft, one line per Property Type.
# NOTE: Property Type has been label-encoded by this point, so the legend shows
# integer codes rather than the property type names.
sns.relplot(data = new_clean_data, x = 'GLA Sq Ft', y = 'Occupied SqFt', hue = 'Property Type', kind = 'line')

- Industrial properties were the most occupied
- Office followed with an occupied sq ft of less than 200000
- Retail had the least occupied sq ft

In [None]:
# Line plot of Occupied SqFt against Gross Value, one line per Property Type,
# drawn from the raw `real` frame (un-encoded property type names)
sns.relplot(data = real, x = 'Gross Value', y = 'Occupied SqFt', hue = 'Property Type', kind = 'line')

In [None]:
# Histogram of property type in the raw data (same plot as the earlier
# Property Type histogram)
sns.histplot(data = real, x = 'Property Type')

**ML: Linear Regression**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import  r2_score

In [None]:
# Predict 'Gross Value' from the three floor-area columns
feature_columns = ['GLA Sq Ft', 'Occupied SqFt', 'Vacant Sq Ft']
x = new_clean_data[feature_columns]
y = new_clean_data[['Gross Value']]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Fit ordinary least squares on the training rows (fit() returns the estimator)
model = LinearRegression().fit(x_train, y_train)

# Score the held-out rows with R^2 (coefficient of determination)
y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)
print(r2)




In [None]:
# Ridge regression: linear regression with an L2 penalty on the coefficients
from sklearn.linear_model import Ridge

# Features, target and train/test split (80/20, fixed seed)
x = new_clean_data[['GLA Sq Ft', 'Occupied SqFt', 'Vacant Sq Ft']]
y = new_clean_data[['Gross Value']]
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Regularization strength passed to Ridge
alpha = 1.0
model = Ridge(alpha=alpha).fit(x_train, y_train)

# Score the held-out rows with R^2 (coefficient of determination)
y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)
print(r2)


**ML:DECISION TREES**

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Features and target for the decision tree
feature_columns = ['Asset Manager','Property Type','GLA Sq Ft', 'Occupied SqFt', 'Vacant Sq Ft','Gross Value']
x = new_clean_data[feature_columns]
y = new_clean_data[['Average Occupancy %']]

# 80/20 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Print the shapes in the order x_train, x_test, y_train, y_test
for split_part in (x_train, x_test, y_train, y_test):
  print(split_part.shape)

In [None]:
# (rows, columns) of the raw data
real.shape

In [None]:
# First rows of the raw data
real.head()

In [None]:
# First rows of the cleaned, label-encoded data
new_clean_data.head()

In [None]:
# (rows, columns) of the cleaned data
new_clean_data.shape

In [None]:
# Fit a shallow decision tree classifier (Gini impurity, depth limited to 3,
# fixed seed so the result is reproducible)
model = DecisionTreeClassifier(criterion='gini',max_depth=3,random_state=0)
model.fit(x_train, y_train)
# Predict the class of every held-out row
y_pred = model.predict(x_test)
# Score the classifier with accuracy (share of exactly matching labels).
# r2_score is a regression metric: on label-encoded classes it treats the
# arbitrary integer codes as magnitudes, so its value is not meaningful here.
from sklearn.metrics import accuracy_score  # imported here so the cell runs on its own
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)


In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Fraction of test rows whose predicted class equals the true class
accuracy = accuracy_score(y_test, y_pred)


In [None]:
# Show the decision tree's accuracy on the test split
print(accuracy)