# Advanced Web Scraping Mini-Project

This notebook demonstrates scraping attempts, API/GraphQL investigation notes, data cleaning, analysis, visualizations, and brand comparison charts using prepared CSVs (Mahindra, Toyota, Kia).

## Requirements
Run the following if you need to install libraries (in terminal):
```
pip install pandas matplotlib plotly seaborn requests beautifulsoup4
```

In [None]:
# Data handling.
import pandas as pd
# Plotting libraries (Plotly Express is used for the interactive charts below).
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
# Apply seaborn's default theme to matplotlib figures.
sns.set()
# Simple confirmation that every import above succeeded.
print('Libraries imported')

## 1. Notes on scraping AckoDrive
AckoDrive renders its listings with client-side JavaScript and applies anti-bot protections, so a direct `requests` call often returns a page with no listing data. Use the prepared CSVs, an authorized API, or phone-based DOM scraping instead.

In [None]:
# Illustrative only: a plain HTTP GET against the Mahindra listing page.
# Everything stays inside one try-block so that any failure (network,
# parsing, ...) is reported as a message rather than stopping the notebook.
import requests
from bs4 import BeautifulSoup

url = 'https://ackodrive.com/collection/mahindra+cars/'
try:
    response = requests.get(url, timeout=10)
    print('status', response.status_code)
    soup = BeautifulSoup(response.text, 'html.parser')
    # A page without a <title> element yields None here.
    if soup.title:
        page_title = soup.title.string
    else:
        page_title = None
    print('title:', page_title)
except Exception as err:
    print('requests error', err)

## 2. Load prepared CSVs
Make sure the CSVs are in `../data/` relative to this notebook. A brand whose file is missing is loaded as an empty DataFrame.

In [None]:
import os

# Brand name -> location of its prepared CSV (relative to this notebook).
paths = dict(
    Mahindra='../data/Mahindra_cars_collection_Mumbai.csv',
    Toyota='../data/Toyota_cars_collection_Mumbai.csv',
    Kia='../data/Kia_cars_collection_Mumbai.csv',
)

# Report, per brand, whether the expected file is actually present.
for brand, csv_path in paths.items():
    found = os.path.exists(csv_path)
    print(brand, found, csv_path)

In [None]:
# Read each brand's CSV; a brand whose file is absent gets an empty frame so
# that every key of `paths` is still present in `dfs`.
dfs = {
    brand: (pd.read_csv(csv_path) if os.path.exists(csv_path) else pd.DataFrame())
    for brand, csv_path in paths.items()
}

# Show the row count and a preview of every loaded frame.
for brand, frame in dfs.items():
    print('\n---', brand, 'rows:', len(frame))
    display(frame.head())

## 3. Cleaning helper and apply

In [None]:
def clean_df(df):
    """Return a cleaned copy of a brand listing DataFrame.

    Steps (the input frame is never modified):

    * strip surrounding whitespace from string column names;
    * convert 'Kilometers Driven', 'Year of Manufacture', 'Price' and
      'Number of owners' (whichever are present) to numbers. Thousands
      separators in text values ("1,20,000") are removed first; anything
      that still cannot be parsed becomes missing. Columns whose values are
      all whole numbers are stored as nullable ``Int64``; a column that
      contains fractional values is kept as float, because ``Int64`` cannot
      represent them and the cast would raise;
    * fill missing 'Location' values with 'Mumbai'.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listing data; may be empty or lack any of the columns above.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    df = df.copy()
    # Only strings have .strip(); leave any other kind of label untouched.
    df.columns = [c.strip() if isinstance(c, str) else c for c in df.columns]
    for col in ['Kilometers Driven','Year of Manufacture','Price','Number of owners']:
        if col not in df.columns:
            continue
        values = df[col]
        if not pd.api.types.is_numeric_dtype(values):
            # Drop comma grouping / stray spaces from text cells only, so
            # non-string cells (numbers, NaN) pass through unchanged.
            values = values.map(
                lambda v: v.replace(',', '').strip() if isinstance(v, str) else v
            )
        numeric = pd.to_numeric(values, errors='coerce')
        # Casting fractional floats to Int64 raises TypeError, so only cast
        # when every present value is a whole number.
        if (numeric.dropna() % 1 == 0).all():
            numeric = numeric.astype('Int64')
        df[col] = numeric
    if 'Location' in df.columns:
        df['Location'] = df['Location'].fillna('Mumbai')
    return df

# Replace every brand's frame with its cleaned copy. The key list is
# snapshotted first because the dict is reassigned while looping.
for brand in list(dfs):
    cleaned = clean_df(dfs[brand])
    dfs[brand] = cleaned
    print(brand, 'cleaned rows', len(cleaned))

## 4. Summary statistics per brand

In [None]:
def _int_stat(df, col, stat):
    """Return int(df[col].<stat>()) over non-missing values, or None.

    None is returned when the column is absent or has no usable values;
    in the latter case the aggregate is NaN / pd.NA, which int() cannot
    convert.
    """
    if col not in df.columns:
        return None
    value = getattr(df[col].dropna(), stat)()
    return None if pd.isna(value) else int(value)


# Per-brand headline numbers; brands with no data map to None.
summary = {}
for k,df in dfs.items():
    if df.empty:
        summary[k] = None
        continue
    summary[k] = {
        'count': len(df),
        'avg_price': _int_stat(df, 'Price', 'mean'),
        'median_km': _int_stat(df, 'Kilometers Driven', 'median'),
        'min_year': _int_stat(df, 'Year of Manufacture', 'min'),
        'max_year': _int_stat(df, 'Year of Manufacture', 'max'),
    }
summary

In [None]:
import pandas as pd
# One row per brand that has statistics; brands recorded as None are left out.
pd.DataFrame.from_dict(
    {brand: stats for brand, stats in summary.items() if stats},
    orient='index',
)

## 5. Visualizations per brand (interactive with Plotly)

In [None]:
def _show_chart(df, required, build_fig, label):
    """Build and show one figure, or report why it was skipped.

    The figure is only built when every name in `required` is a column of
    `df`; otherwise a message is printed and None is returned, so a single
    missing column does not abort the remaining charts.
    """
    missing = [c for c in required if c not in df.columns]
    if missing:
        print(f'skipping {label}: missing columns {missing}')
        return None
    figure = build_fig(df)
    figure.show()
    return figure


for k,df in dfs.items():
    if df.empty:
        print('\n--',k,'no data --')
        continue
    print('\n###',k)
    # Histogram of manufacture years.
    fig = _show_chart(
        df, ['Year of Manufacture'],
        lambda d: px.histogram(d, x='Year of Manufacture', title=f'{k} - Cars by Year'),
        f'{k} - Cars by Year',
    )
    # Share of each fuel type.
    fig2 = _show_chart(
        df, ['Fuel Type'],
        lambda d: px.pie(d, names='Fuel Type', title=f'{k} - Fuel Type Distribution'),
        f'{k} - Fuel Type Distribution',
    )
    # Price against distance driven, with the car name shown on hover.
    fig3 = _show_chart(
        df, ['Kilometers Driven', 'Price', 'Name'],
        lambda d: px.scatter(d, x='Kilometers Driven', y='Price', title=f'{k} - Price vs KM', hover_data=['Name']),
        f'{k} - Price vs KM',
    )

## 6. Brand comparison charts

In [None]:
# Prepare comparison data
brands = []        # brands that have at least one row, in dfs order
avg_prices = []    # truncated mean price per brand (None when unavailable)
counts = []        # number of rows per brand
fuel_types = set()
fuel_counts = {}   # fuel type -> {brand: number of cars}

for brand, df in dfs.items():
    if df.empty:
        continue
    brands.append(brand)
    counts.append(len(df))

    # The mean of an absent or all-missing Price column is NaN / pd.NA,
    # which int() rejects; record None so the lists stay aligned.
    mean_price = df['Price'].dropna().mean() if 'Price' in df.columns else None
    avg_prices.append(None if pd.isna(mean_price) else int(mean_price))

    if 'Fuel Type' not in df.columns:
        continue
    vc = df['Fuel Type'].value_counts().to_dict()
    for ft, ct in vc.items():
        fuel_types.add(ft)
        fuel_counts.setdefault(ft, {})[brand] = ct

# Build fuel DataFrame: one row per brand, one column per fuel type,
# zero where a brand has no cars of that type.
fuel_df = pd.DataFrame(0, index=brands, columns=sorted(list(fuel_types)))
for ft, brand_counts in fuel_counts.items():
    for brand, ct in brand_counts.items():
        fuel_df.loc[brand, ft] = ct

print('brands', brands)
display(pd.DataFrame({'brand':brands,'avg_price':avg_prices,'count':counts}))
display(fuel_df)

# Plot comparisons (skipped when there is nothing to compare)
if brands:
    fig = px.bar(x=brands, y=avg_prices, labels={'x':'Brand','y':'Avg Price'}, title='Average Price by Brand')
    fig.show()
    fig2 = px.bar(x=brands, y=counts, labels={'x':'Brand','y':'Count'}, title='Car Count by Brand')
    fig2.show()
    if fuel_types:
        fig3 = px.bar(fuel_df, x=fuel_df.index, y=fuel_df.columns, title='Fuel Type Mix (stacked)')
        fig3.update_layout(barmode='stack')
        fig3.show()
    else:
        print('No fuel type data available; skipping fuel mix chart')
else:
    print('No brand data available; skipping comparison charts')

## 7. Export cleaned combined CSV

In [None]:
# Concatenate every non-empty brand frame and write it next to the inputs.
out_path = '../data/combined_brands_cleaned.csv'
non_empty = [dfs[b] for b in dfs if not dfs[b].empty]
if non_empty:
    combined = pd.concat(non_empty, ignore_index=True)
    # The directory is missing whenever the prepared CSVs were never copied in.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    combined.to_csv(out_path, index=False)
    print('Saved:', out_path)
else:
    # pd.concat raises ValueError on an empty list, so skip the export.
    combined = pd.DataFrame()
    print('No brand data loaded; nothing to export')