# SALES ANALYSIS

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")
import os
import calendar

In [None]:
import os
import pandas as pd


def combine_csv_files(folder_path, output_path="combined_output.csv", sep=","):
    """Combine every CSV file found in a folder into one CSV file.

    Files are processed in sorted filename order so that the row order of
    the combined file is reproducible (``os.listdir`` returns entries in
    arbitrary order).  A file that cannot be read is reported and skipped;
    the remaining files are still combined.

    NOTE(review): the notebook re-reads the combined file with ``;`` as the
    delimiter, which suggests the inputs are semicolon-separated.  With the
    default ``sep`` each such line is carried through as a single column;
    pass ``sep=";"`` to parse real columns -- confirm against the data.

    Args:
        folder_path: Directory scanned (non-recursively) for files whose
            name ends with ``.csv``.
        output_path: Path the combined CSV is written to.  Defaults to
            ``"combined_output.csv"`` (relative to the working directory).
        sep: Field separator used both to parse the input files and to
            write the combined file.  Defaults to ``","``.

    Returns:
        None.  The combined data is written to ``output_path``; nothing is
        written when no file could be read.
    """
    dfs = []

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(folder_path, filename)
        try:
            dfs.append(pd.read_csv(file_path, sep=sep, engine='python'))
        except Exception as e:
            # Deliberately best-effort: one unreadable file must not abort
            # the whole merge, so report it and carry on.
            print(f"Error reading file {file_path}: {e}")

    if not dfs:
        print("No valid CSV files found.")
        return

    # Stack all frames and renumber the rows from 0.
    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df.to_csv(output_path, sep=sep, index=False)

    print("Combined CSV file saved successfully.")


# Folder containing the CSV files to combine.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
folder_path = "/Users/jaideepsai/Desktop/DATA-ANALYTICS/Data Source/extracted-data/RealEstate/sales_properties"

# Merge every CSV file in that folder into a single CSV file on disk.
combine_csv_files(folder_path)


In [None]:
# Load the combined sales file; its fields are separated by semicolons.
sales_path = "combined_output.csv"
sales_data = pd.read_csv(sales_path, sep=';')

In [None]:
# Preview the first rows of the loaded sales data.
sales_data.head()

In [None]:
# (number of rows, number of columns) of the sales data.
sales_data.shape

In [None]:
# Print column names, dtypes and non-null counts.
sales_data.info()

In [None]:
# Number of missing values in each column.
sales_data.isnull().sum()

In [None]:
# List the column labels of the sales data.
print(sales_data.columns)

In [None]:
# Remove five columns from the sales data.
sales_data = sales_data.drop(
    columns=['Days On Zillow', 'Country', 'sgapt', 'Lot Size', 'isZillowOwned'],
)

In [None]:
# Re-check the missing-value counts after dropping columns.
sales_data.isnull().sum()

In [None]:
# Impute missing values: numeric estimates get their column mean, a missing
# broker name becomes the placeholder "unknown".
mean_values = sales_data[['Zestimate', 'Rent Zestimate']].mean()
sales_data.loc[:, 'Zestimate'] = sales_data['Zestimate'].fillna(mean_values['Zestimate'])
sales_data.loc[:, 'Rent Zestimate'] = sales_data['Rent Zestimate'].fillna(mean_values['Rent Zestimate'])
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated and leaves the
# DataFrame unchanged when pandas Copy-on-Write is active.
sales_data['Broker Name'] = sales_data['Broker Name'].fillna("unknown")

In [None]:
# Re-check the missing-value counts after filling.
sales_data.isnull().sum()

In [None]:
# Mean of the Price column for each Zip value.
avg_by_zip = sales_data.groupby('Zip')['Price'].mean()

# Bar chart of the per-zip averages.
plt.figure(figsize=(10, 6))
avg_by_zip.plot.bar(color='teal')
plt.gca().set(
    title='Average Property Price by Zip Code',
    xlabel='Zip Code',
    ylabel='Average Price',
)
plt.xticks(rotation=45)  # Slanted tick labels are easier to read
plt.tight_layout()
plt.show()


In [None]:
# Plot the distribution of each listed column in its own subplot: a count
# plot for non-numeric columns, a histogram with a KDE curve for numeric ones.
amenties = ['Bedrooms', 'Bathrooms', 'Living Area', 'Home Type']

# Two plots per row; the number of rows follows from the number of columns
# to plot, so no empty row is left in the figure.
n_cols = 2
n_rows = (len(amenties) + n_cols - 1) // n_cols

plt.figure(figsize=(12, 10))
# The loop variable is kept distinct from the list name so that ``amenties``
# still refers to the full list of columns after the loop has finished.
for i, column in enumerate(amenties, 1):
    plt.subplot(n_rows, n_cols, i)
    if pd.api.types.is_numeric_dtype(sales_data[column]):
        sns.histplot(x=column, data=sales_data, kde=True)
        y_label = 'Frequency'
    else:  # Non-numeric (e.g. text) columns
        sns.countplot(x=column, data=sales_data)
        y_label = 'Count'
    plt.title(f'Distribution of {column}')
    plt.xlabel(column)
    plt.ylabel(y_label)

plt.tight_layout()
plt.show()

In [None]:
# Grouped count plot: one bar per statusText value within each Zip value.
plt.figure(figsize=(10, 6))
sns.countplot(data=sales_data, x='Zip', hue='statusText', palette='Set3')
plt.gca().set(
    title='Property Status by Postal Code',
    xlabel='Postal Code',
    ylabel='Count',
)
plt.xticks(rotation=45)
plt.legend(title='Status')
plt.grid(axis='y')
plt.tight_layout()
plt.show()

# SALES ANALYSIS WITH POOL

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")
import os
import calendar

In [None]:
import os
import pandas as pd


def combine_csv_files(folder_path, output_path="properties_with_pool.csv", sep=","):
    """Combine every CSV file found in a folder into one CSV file.

    Files are processed in sorted filename order so that the row order of
    the combined file is reproducible (``os.listdir`` returns entries in
    arbitrary order).  A file that cannot be read is reported and skipped;
    the remaining files are still combined.

    NOTE(review): the notebook re-reads the combined file with ``;`` as the
    delimiter, which suggests the inputs are semicolon-separated.  With the
    default ``sep`` each such line is carried through as a single column;
    pass ``sep=";"`` to parse real columns -- confirm against the data.

    Args:
        folder_path: Directory scanned (non-recursively) for files whose
            name ends with ``.csv``.
        output_path: Path the combined CSV is written to.  Defaults to
            ``"properties_with_pool.csv"`` (relative to the working
            directory).
        sep: Field separator used both to parse the input files and to
            write the combined file.  Defaults to ``","``.

    Returns:
        None.  The combined data is written to ``output_path``; nothing is
        written when no file could be read.
    """
    dfs = []

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(folder_path, filename)
        try:
            dfs.append(pd.read_csv(file_path, sep=sep, engine='python'))
        except Exception as e:
            # Deliberately best-effort: one unreadable file must not abort
            # the whole merge, so report it and carry on.
            print(f"Error reading file {file_path}: {e}")

    if not dfs:
        print("No valid CSV files found.")
        return

    # Stack all frames and renumber the rows from 0.
    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df.to_csv(output_path, sep=sep, index=False)

    print("Combined CSV file saved successfully.")


# Folder containing the CSV files to combine.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
folder_path = "/Users/jaideepsai/Desktop/DATA-ANALYTICS/Data Source/extracted-data/RealEstate/sales_properties_pools"

# Merge every CSV file in that folder into a single CSV file on disk.
combine_csv_files(folder_path)


In [None]:
# Load the combined pool-listings file; its fields are separated by semicolons.
sales_pools_path = "properties_with_pool.csv"
sales_pools_data = pd.read_csv(sales_pools_path, sep=';')

In [None]:
# Preview the first rows of the loaded pool data.
sales_pools_data.head()

In [None]:
# Add a constant 'pool' column set to 1 for every row of this dataset.
sales_pools_data['pool'] = 1

In [None]:
# Preview the data again, now including the 'pool' column.
sales_pools_data.head()

In [None]:
# Count, per zip code, the rows whose 'pool' flag is 1.
# The x-axis order covers every zip code present in the dataset, so zip codes
# without any flagged row would still appear (with a count of zero).
all_zips = sales_pools_data['Zip'].unique()
pool_properties = sales_pools_data[sales_pools_data['pool'] == 1]
plt.figure(figsize=(10, 6))
# Plot the filtered frame rather than the full one so the chart keeps matching
# its title even if rows with a different 'pool' value are present.
sns.countplot(x='Zip', data=pool_properties, order=all_zips, palette='Reds')
plt.title('Number of Properties with a Pool by Zip Code')
plt.xlabel('Zip Code')
plt.ylabel('Count')
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.grid(axis='y')  # Add grid lines on the y-axis
plt.tight_layout()
plt.show()