# **(ADD THE NOTEBOOK NAME HERE)**

## Objectives

* Write your notebook objective here, for example, "Fetch data from Kaggle and save as raw data", or "engineer features for modelling"

## Inputs

* Write here which data or information you need to run the notebook 

## Outputs

* Write here which files, code or artefacts you generate by the end of the notebook 

## Additional Comments

* In case you have any additional comments that don't fit in the previous bullets, please state them here. 


---

# Change working directory

* We are assuming you will store the notebooks in a subfolder, therefore when running the notebook in the editor, you will need to change the working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [None]:
import os
# Capture the directory the kernel is currently running in; the os.chdir cell
# below uses this value to find the parent folder.
current_dir = os.getcwd()
current_dir  # bare last expression so the notebook displays the path

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [None]:
# Step up one level: the parent of the directory held in `current_dir`
# becomes the process working directory.
parent_dir = os.path.dirname(current_dir)
os.chdir(parent_dir)
print("You set a new current directory")

Confirm the new current directory

In [None]:
# Re-read the working directory and display it.
# NOTE(review): this overwrites `current_dir`; because the os.chdir cell derives
# its target from `current_dir`, re-running that cell after this one would move
# up one more level.
current_dir = os.getcwd()
current_dir

# Backup

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# NOTE(review): `df` is not created anywhere in this notebook; it is expected to
# already exist in the kernel and to contain the 'BsmtExposure' and 'SalePrice'
# columns — confirm before running on a fresh kernel.
# Work on a copy so the original dataframe is left untouched.
df_missing = df.copy()
df_missing['BsmtExposure'] = df_missing['BsmtExposure'].fillna('Missing')

# One-hot encode BsmtExposure (drop_first=True omits the first category level)
df_encoded = pd.get_dummies(df_missing, columns=['BsmtExposure'], drop_first=True)

# Correlation is only defined for numeric data. Selecting numeric/boolean
# columns explicitly keeps .corr() from failing on any remaining text columns
# (recent pandas raises instead of silently skipping them) and works the same
# on older pandas versions.
correlation_matrix = df_encoded.select_dtypes(include=['number', 'bool']).corr()

# Extract the correlations with SalePrice
correlation_with_saleprice = correlation_matrix['SalePrice'].drop('SalePrice')

# Keep only the one-hot columns (get_dummies prefixes them with 'BsmtExposure_')
# so the chart shows what its title announces.
encoded_columns = [
    name for name in correlation_with_saleprice.index
    if str(name).startswith('BsmtExposure_')
]
correlation_with_saleprice = correlation_with_saleprice.loc[encoded_columns]

# Plot the correlations with SalePrice
plt.figure(figsize=(10, 6))
sns.barplot(x=correlation_with_saleprice.values, y=correlation_with_saleprice.index, palette="viridis")
plt.title('Correlation of BsmtExposure Encoded Variables with SalePrice')
plt.xlabel('Correlation coefficient')
plt.ylabel('Encoded Variables')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr

def correlation_to_sale_price_spearman(df, vars_to_study):
    """Plot each variable against SalePrice, annotated with its Spearman correlation.

    For every column in ``vars_to_study`` a seaborn ``lmplot`` (scatter plot with
    a red regression line) is drawn and titled with the Spearman coefficient and
    its p-value.

    Args:
        df: DataFrame containing 'SalePrice' and every column in ``vars_to_study``.
        vars_to_study: iterable of column names to plot against 'SalePrice'.

    Returns:
        None. Each figure is shown and blank lines are printed after it.
    """
    target_var = 'SalePrice'

    for col in vars_to_study:
        # Compute Spearman only on rows where both values are present: with the
        # default settings a single NaN would turn the coefficient and p-value
        # into NaN.
        has_both = df[col].notna() & df[target_var].notna()
        spearman_corr, p_value = spearmanr(df.loc[has_both, col], df.loc[has_both, target_var])

        # Create scatter plot with regression line
        g = sns.lmplot(data=df, x=col, y=target_var, line_kws={'color': 'red'})

        # Set the title and labels
        g.set_axis_labels(col, target_var, fontsize=15)
        g.fig.suptitle(f"{col} (Spearman: {spearman_corr:.2f}, p-value: {p_value:.2e})", fontsize=20, y=1.05)

        plt.show()
        print("\n\n")

In [None]:
def correlation_to_sale_price_joint(df, vars_to_study):
    """Show a hexbin joint plot of every studied variable against SalePrice.

    Args:
        df: DataFrame holding 'SalePrice' and the columns in ``vars_to_study``.
        vars_to_study: iterable of column names, one figure is drawn per name.
    """
    target_var = 'SalePrice'
    for feature in vars_to_study:
        sns.jointplot(data=df, x=feature, y=target_var, kind='hex')
        # The title is placed with manual x/y offsets — presumably to position
        # it over the joint figure; confirm visually.
        plt.title(f"{feature}", fontsize=20, y=1.3, x=-3)
        plt.show()
        print("\n\n")


# NOTE(review): `df_eda` and `vars_to_study` are not defined in the visible part
# of this notebook; they must already exist in the kernel for this call to run.
correlation_to_sale_price_joint(df_eda, vars_to_study)

In [None]:
def correlation_to_sale_price_scat(df, vars_to_study):
    """Draw one scatter plot per studied variable against SalePrice.

    Points are coloured by the 'OverallQual' column.

    Args:
        df: DataFrame holding 'SalePrice', 'OverallQual' and the studied columns.
        vars_to_study: iterable of column names, one figure is drawn per name.
    """
    target_var = 'SalePrice'
    for feature in vars_to_study:
        _, ax = plt.subplots(figsize=(8, 5))
        sns.scatterplot(data=df, x=feature, y=target_var, hue='OverallQual', ax=ax)
        ax.set_title(f"{feature}", fontsize=20, y=1.05)
        plt.show()
        print("\n\n")

# NOTE(review): `df_eda` and `vars_to_study` are not defined in the visible part
# of this notebook; they must already exist in the kernel for this call to run.
correlation_to_sale_price_scat(df_eda, vars_to_study)

In [None]:
def correlation_to_sale_price_lm(df, vars_to_study):
    """Draw a seaborn lmplot (scatter plus regression fit) per variable vs SalePrice.

    Args:
        df: DataFrame holding 'SalePrice' and the columns in ``vars_to_study``.
        vars_to_study: iterable of column names, one figure is drawn per name.
    """
    target_var = 'SalePrice'
    for feature in vars_to_study:
        # lmplot builds its own figure, so no plt.subplots() call is needed here
        sns.lmplot(data=df, x=feature, y=target_var)
        plt.title(f"{feature}", fontsize=20, y=1.05)
        plt.show()
        print("\n\n")

# NOTE(review): `df_eda` and `vars_to_study` are not defined in the visible part
# of this notebook; they must already exist in the kernel for this call to run.
correlation_to_sale_price_lm(df_eda, vars_to_study)

In [None]:
def correlation_to_sale_price_hist(df, vars_to_study):
    """Draw a bivariate histogram of each studied variable against SalePrice.

    Args:
        df: DataFrame holding 'SalePrice' and the columns in ``vars_to_study``.
        vars_to_study: iterable of column names, one figure is drawn per name.
    """
    target_var = 'SalePrice'
    for feature in vars_to_study:
        _, ax = plt.subplots(figsize=(8, 5))
        sns.histplot(data=df, x=feature, y=target_var, ax=ax)
        ax.set_title(f"{feature}", fontsize=20, y=1.05)
        plt.show()
        print("\n\n")

# NOTE(review): `df_eda` and `vars_to_study` are not defined in the visible part
# of this notebook; they must already exist in the kernel for this call to run.
correlation_to_sale_price_hist(df_eda, vars_to_study)

In [None]:
# Maps column name -> list of distinct values in that column that are not ints
# (after the float filter described below).
non_integer_values_dict = {}

for column in df.columns:
    # True where the cell holds an int
    is_int_mask = df[column].apply(lambda value: isinstance(value, int))
    if is_int_mask.all():
        continue  # purely integer column, nothing to report

    candidates = df[column][~is_int_mask]
    # Discard floats whose text form does not begin with '0.'; everything else
    # (non-float values and floats such as 0.5) is kept.
    discard_mask = candidates.apply(
        lambda value: isinstance(value, float) and not str(value).startswith('0.')
    )
    # A set removes duplicates before the values are stored as a list
    non_integer_values_dict[column] = list(set(candidates[~discard_mask]))

# Print the results
for column, values in non_integer_values_dict.items():
    print(f"Non-integer values in {column}: {values}")

In [None]:
import matplotlib.pyplot as plt

# Take row 19 of the PPS matrix, order its entries by absolute value (largest
# first) and keep positions 1 to 10, i.e. skip the very first entry.
# NOTE(review): `pps_matrix` is not defined in this notebook and what row 19
# represents is not visible here — confirm against the cell that builds it.
selected_row = pps_matrix.iloc[19]
ranked_scores = selected_row.sort_values(key=abs, ascending=False)
pps_topscores = ranked_scores[1:11]

# Show the raw numbers
print(pps_topscores)

# Bar chart of the selected scores
plt.bar(pps_topscores.index, pps_topscores)
plt.xticks(rotation=90)
plt.title("Predictive Power Score", fontsize=20, y=1.05)

# Write each score just above its bar
for bar_position, score in enumerate(pps_topscores):
    plt.text(bar_position, score, f'{score:.2f}', ha='center', va='bottom')

plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# NOTE(review): `df` is not created anywhere in this notebook; it is expected to
# already exist in the kernel with 'BsmtExposure', 'BsmtFinType1' and
# 'SalePrice' columns — confirm before running on a fresh kernel.
# Work on a copy so the original dataframe is left untouched.
df_missing = df.copy()
df_missing['BsmtExposure'] = df_missing['BsmtExposure'].fillna('Missing')
df_missing['BsmtFinType1'] = df_missing['BsmtFinType1'].fillna('Missing')

# Calculate the mean SalePrice for each BsmtExposure category
mean_saleprice = df_missing.groupby('BsmtExposure')['SalePrice'].mean().reset_index()

# Shape the data for the heatmap: one row per BsmtExposure category and a single
# 'SalePrice' column holding the mean. (Pivoting with SalePrice as both the
# columns and the values produced a mostly-NaN matrix, and positional arguments
# to DataFrame.pivot are rejected by pandas 2.)
pivot_table = mean_saleprice.set_index('BsmtExposure')[['SalePrice']]

# Create the heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(pivot_table, annot=True, fmt=".1f", cmap="YlGnBu")
plt.title('Average Sale Price by BsmtExposure')
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr, spearmanr, linregress
from statsmodels.nonparametric.smoothers_lowess import lowess

# Sample mapping for KitchenQual: each category label is assigned an integer code
# NOTE(review): labels absent from this dict become NaN after .map() — confirm
# these four keys cover every value present in the data.
kitchen_qual_mapping = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1}

# Transform the KitchenQual values to numerical values in the DataFrame
# NOTE(review): this adds a column to the shared `df` in place; `df` itself is
# not created in this notebook and must already exist in the kernel.
df['KitchenQual_num'] = df['KitchenQual'].map(kitchen_qual_mapping)

# Display the count of each category in KitchenQual
print(df['KitchenQual'].value_counts())

# Function to plot variables against SalePrice with a linear fit and a LOWESS curve
def plot_with_trendlines(df, vars, target='SalePrice'):
    """Scatter-plot each variable against ``target`` with two trend curves.

    One subplot is drawn per variable, stacked vertically. Each subplot shows
    the data points, a least-squares straight line (legend carries the Pearson
    coefficient) and a LOWESS-smoothed curve (legend carries the Spearman
    coefficient).

    Args:
        df: DataFrame containing ``target`` and every column named in ``vars``.
        vars: iterable of numeric column names to plot on the x axis.
        target: name of the column plotted on the y axis.

    Returns:
        None. The figure is displayed with ``plt.show()``; nothing is drawn
        when ``vars`` is empty.
    """
    vars = list(vars)
    num_vars = len(vars)
    if num_vars == 0:
        # A figure of height 6 * 0 would be invalid, so there is nothing to do.
        return

    # squeeze=False keeps `axes` two-dimensional even for a single variable
    fig, axes = plt.subplots(num_vars, 1, figsize=(16, 6 * num_vars), squeeze=False)

    for ax, var in zip(axes[:, 0], vars):
        # Use only rows where both values are present; the correlation and
        # regression routines do not skip NaN on their own.
        has_both = df[var].notna() & df[target].notna()
        x = df.loc[has_both, var]
        y = df.loc[has_both, target]

        sns.scatterplot(x=x, y=y, label='Data points', ax=ax)
        ax.set_xlabel(var)
        ax.set_ylabel(target)

        if x.nunique() < 2:
            # A slope or correlation cannot be computed from a constant or
            # empty x; show the points only.
            ax.set_title(f'{var} vs {target} (too few distinct values for trendlines)')
            ax.legend()
            continue

        # Pearson correlation and least-squares line
        pearson_coef, _ = pearsonr(x, y)
        slope_pearson, intercept_pearson, _, _, _ = linregress(x, y)
        # Two end points are enough to draw a straight line
        x_ends = [x.min(), x.max()]
        line_ends = [slope_pearson * value + intercept_pearson for value in x_ends]

        # Spearman correlation and LOWESS-smoothed curve. The curve comes from
        # LOWESS, not from the Spearman coefficient, so it is labelled as such.
        spearman_coef, _ = spearmanr(x, y)
        lowess_smoothed = lowess(y, x, frac=0.3)

        ax.plot(x_ends, line_ends, color='red', label=f'Linear fit (Pearson r={pearson_coef:.2f})')
        ax.plot(lowess_smoothed[:, 0], lowess_smoothed[:, 1], color='blue', label=f'LOWESS curve (Spearman rho={spearman_coef:.2f})')

        ax.set_title(f'{var} vs {target} with linear and LOWESS trendlines')
        ax.legend()

    fig.tight_layout()
    plt.show()

# Example usage for multiple variables, including transformed KitchenQual
# ('KitchenQual_num' is the column added to `df` earlier in this cell)
variables = ['YearBuilt', 'OverallQual', 'KitchenQual_num']
plot_with_trendlines(df, variables)

Section 1 content

---

# Section 2

Section 2 content

---

NOTE

* You may add as many sections as you want, as long as they support your project workflow.
* All notebook cells should be run top-down: do not create a workflow in which, at a given point, you need to go back to a previous cell to execute some task (for example, going back to a previous cell to refresh a variable's content).

---

# Push files to Repo

* If you do not need to push files to Repo, you may replace this section with "Conclusions and Next Steps" and state your conclusions and next steps.

In [None]:
import os
try:
  # create here your folder
  # os.makedirs(name='')
  pass  # placeholder: a `try` body made only of comments is a syntax error
except Exception as e:
  # Report the problem (e.g. the folder already exists) without stopping the notebook
  print(e)
