# In [2]:
import pandas as pd
import os
import matplotlib.pyplot as plt

def clone_repo(repo_url):
    """Clone a Git repository into the current directory unless it is already there.

    The local directory name is the last path component of ``repo_url`` with a
    trailing ``.git`` suffix removed.

    Args:
        repo_url: URL (or path) of the repository to clone.

    Returns:
        The name of the local directory that holds the repository.

    Raises:
        ValueError: If no directory name can be derived from ``repo_url``.
        FileNotFoundError: If the ``git`` executable cannot be found.
        subprocess.CalledProcessError: If ``git clone`` exits with a non-zero
            status.
    """
    import subprocess  # local import keeps this function self-contained

    # Ignore trailing slashes and strip only a *trailing* ".git"; replacing
    # every ".git" occurrence would mangle names such as "my.github.io".
    repo_name = repo_url.rstrip('/').split('/')[-1]
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]
    if not repo_name:
        raise ValueError(f"Cannot derive a repository name from {repo_url!r}")

    if os.path.exists(repo_name):
        print(f"Repository {repo_name} already cloned.")
        return repo_name

    print(f"Cloning repository from {repo_url}...")
    # An argument list (no shell) means the URL is never shell-interpreted;
    # "--" stops git from reading a URL that starts with "-" as an option.
    # The target directory is passed explicitly so it always matches the
    # returned name, and check=True surfaces a failed clone immediately.
    subprocess.run(["git", "clone", "--", repo_url, repo_name], check=True)
    return repo_name

# Fetch the New York City Airbnb Open Data repository
repo_url = "https://github.com/ine-rmotr-projects/DCP-new-york-city-airbnb-open-data.git"
repo_name = clone_repo(repo_url)

# The CSV is looked for at the repository root and under data/;
# the first candidate path that exists is used.
possible_files = ["AB_NYC_2019.csv", "data/AB_NYC_2019.csv"]
candidate_paths = [os.path.join(repo_name, relative_path) for relative_path in possible_files]
data_file = next((path for path in candidate_paths if os.path.exists(path)), None)

if data_file is None:
    raise FileNotFoundError("Data file not found in the repository. Please check the repository structure.")

# Load the dataset
data = pd.read_csv(data_file)

# Preview the data
print("Preview of the dataset:")
print(data.head())

# Display basic information about the dataset.
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the summary.
print("\nDataset information:")
data.info()

# Check for missing values (count of nulls per column)
missing_values = data.isnull().sum()
print("\nMissing values:")
print(missing_values)

# Data Cleaning
# Missing 'reviews_per_month' entries become 0, then every row that lacks a
# 'name' or a 'host_name' is discarded.
monthly_reviews = data['reviews_per_month'].fillna(0)
data = (
    data
    .assign(reviews_per_month=monthly_reviews)
    .dropna(subset=['name', 'host_name'])
)

# Report the per-column null counts that are left after cleaning
remaining_missing = data.isnull().sum()
print("\nData cleaned. Remaining missing values:", remaining_missing, sep="\n")

# Analyze the dataset
# Listing count for each neighbourhood group
neighborhood_group_counts = data['neighbourhood_group'].value_counts()
print("\nListings by neighborhood group:", neighborhood_group_counts, sep="\n")

# Mean listing price within each neighbourhood group
price_by_group = data.groupby('neighbourhood_group')['price']
average_price_by_group = price_by_group.mean()
print("\nAverage price by neighborhood group:", average_price_by_group, sep="\n")

# Mean price per neighbourhood, ranked from highest to lowest; keep the first five
mean_price_by_neighbourhood = data.groupby('neighbourhood')['price'].mean()
ranked_neighbourhoods = mean_price_by_neighbourhood.sort_values(ascending=False)
top_expensive_neighborhoods = ranked_neighbourhoods.head(5)
print("\nTop 5 most expensive neighborhoods:", top_expensive_neighborhoods, sep="\n")

# Visualization
# One bar chart per entry:
# (series to draw, chart title, x-axis label, y-axis label, extra plot options).
# The colour is supplied only for the charts that override the default.
bar_charts = [
    (
        neighborhood_group_counts,
        "Distribution of Listings by Neighborhood Group",
        "Neighborhood Group",
        "Number of Listings",
        {},
    ),
    (
        average_price_by_group,
        "Average Price by Neighborhood Group",
        "Neighborhood Group",
        "Average Price",
        {'color': 'orange'},
    ),
    (
        top_expensive_neighborhoods,
        "Top 5 Most Expensive Neighborhoods",
        "Neighborhood",
        "Average Price",
        {'color': 'green'},
    ),
]

for series, chart_title, x_label, y_label, plot_options in bar_charts:
    series.plot(kind='bar', title=chart_title, figsize=(8, 6), **plot_options)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()

# Optionally persist the cleaned table as a CSV inside the cloned repository
# directory; the DataFrame index is not written.
output_file = os.path.join(repo_name, "processed_airbnb_data.csv")
data.to_csv(output_file, index=False)
print("Processed data saved to", output_file)



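# Captured cell output. NOTE(review): the URL and error text below do not match
# the code above, so this output appears to come from a different run — confirm.
#
# Cloning repository from https://github.com/ine-rmotr-projects/RDP-health-and-obesity-trends.git...
#
# FileNotFoundError: Data file not found: RDP-health-and-obesity-trends/data/obesity-cleaned.csv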