In [None]:
# Colab upload prompt. The next cell reads "EV Maker by Place.csv" by relative
# name, so the uploaded file is expected to carry exactly that name.
from google.colab import files
uploaded = files.upload()


In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files  # to enable file download in Colab

# ------------------------------
# 1. Data Loading
# ------------------------------
# Upload the file if not already in the environment:
# files.upload()  # Uncomment this line to prompt file upload (if needed)

# Read the EV-maker table (the file name must match the uploaded file) and
# preview its first rows.
ev_maker = pd.read_csv("EV Maker by Place.csv")
print("EV Maker by Place Data:", ev_maker.head(), sep="\n")

# ------------------------------
# 2. Data Pre-Processing
# ------------------------------
# Normalise headers: trim surrounding whitespace, lower-case, spaces -> underscores.
ev_maker.columns = [
    header.strip().lower().replace(' ', '_') for header in ev_maker.columns
]
print("\nStandardized Column Names:", ev_maker.columns, sep="\n")

# Per-column count of missing cells (informational only; nothing is dropped).
print("\nMissing Values in Each Column:", ev_maker.isnull().sum(), sep="\n")

# ------------------------------
# 3. Exploratory Data Analysis (EDA) & Visualization
# ------------------------------
# Number of rows per place. This relies on a "place" column existing after the
# header normalisation above; change the groupby key if the dataset differs.
rows_per_place = ev_maker.groupby('place').size()
ev_maker_count = rows_per_place.reset_index(name='count')
print("\nCount of EV Makers by Place:", ev_maker_count, sep="\n")

# Bar chart: one bar per place, height = number of rows for that place.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(ev_maker_count['place'], ev_maker_count['count'], color='skyblue')
ax.set_title("Number of EV Makers by Place")
ax.set_xlabel("Place")
ax.set_ylabel("Count")
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt the place labels
fig.tight_layout()

# Write the PNG first, then display the figure.
save_path = "ev_makers_distribution.png"
fig.savefig(save_path, dpi=300)
plt.show()

# ------------------------------
# 4. Downloading the Visualization
# ------------------------------
# Hand the saved image to the Colab download helper.
files.download(save_path)


In [None]:
# Colab upload prompt. The next cell reads
# "Details_of_Public_Charging_Stations_Installed.xlsx" by relative name, so the
# uploaded file is expected to carry exactly that name.
from google.colab import files
uploaded = files.upload()

In [None]:
import pandas as pd

# Read the workbook; column labels are taken from the third row (header=2).
df = pd.read_excel("Details_of_Public_Charging_Stations_Installed.xlsx", header=2)

# The sheet is treated as three side-by-side tables of three columns each
# (positions 0-2, 4-6 and 8-10). Positions 3 and 7 are skipped -- presumably
# spacer columns; verify against the workbook layout.
df1 = df.iloc[:, 0:3].copy()
df2 = df.iloc[:, 4:7].copy()
df3 = df.iloc[:, 8:11].copy()

# Give all three slices the same column labels so they can be stacked.
new_cols = ['State', 'City', 'No. of PCS installed']
df1.columns = new_cols
df2.columns = new_cols
df3.columns = new_cols

# Stack the three slices vertically into one table with a fresh 0..n-1 index.
df_combined = pd.concat([df1, df2, df3], ignore_index=True)

# Convert the "No. of PCS installed" column to numeric; anything that cannot be
# parsed becomes NaN and is removed by the dropna below.
df_combined['No. of PCS installed'] = pd.to_numeric(df_combined['No. of PCS installed'], errors='coerce')

# Keep only rows that have both a City and a numeric station count.
df_clean = df_combined.dropna(subset=['City', 'No. of PCS installed']).reset_index(drop=True)

# Trim surrounding whitespace in the text columns.
df_clean['City'] = df_clean['City'].str.strip()
df_clean['State'] = df_clean['State'].str.strip()

print("Cleaned and Standardized Dataset:")
print(df_clean.head())
print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
df_clean.info()


In [None]:
# Total stations per city: sum "No. of PCS installed" within each City group,
# keeping City as an ordinary column.
charging_count = df_clean.groupby("City", as_index=False)["No. of PCS installed"].sum()

# Bar chart: one bar per city.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(charging_count["City"], charging_count["No. of PCS installed"], color='lightgreen')
ax.set_title("Total Public Charging Stations by City")
ax.set_xlabel("City")
ax.set_ylabel("Total No. of PCS Installed")
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt the city labels
fig.tight_layout()

# Write the PNG, display the figure, then pass the file to the download helper.
save_path = "charging_stations_by_city.png"
fig.savefig(save_path, dpi=300)
plt.show()

files.download(save_path)


In [None]:
# Colab upload prompt. The next cell reads "2-wheeler-EV-bikewale.csv" by
# relative name, so the uploaded file is expected to carry exactly that name.
from google.colab import files
uploaded = files.upload()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files

# Read the two-wheeler data and normalise its headers
# (trim, lower-case, spaces -> underscores).
df_2w = pd.read_csv("2-wheeler-EV-bikewale.csv")
df_2w.columns = [header.strip().lower().replace(' ', '_') for header in df_2w.columns]

print("2-Wheeler EV Dataset Head:", df_2w.head(), sep="\n")
print("\nMissing Values in Each Column:", df_2w.isnull().sum(), sep="\n")

# 1. Histogram for Rating Distribution
fig, ax = plt.subplots(figsize=(10, 6))
ratings = df_2w['rating']
# Unit-width bins: edges run from int(min) up to int(max) + 1 inclusive
# (range() excludes its stop value). Assumes integer-like ratings.
first_edge = int(ratings.min())
stop_edge = int(ratings.max()) + 2
ax.hist(ratings.dropna(), bins=range(first_edge, stop_edge),
        color='lightblue', edgecolor='black')
ax.set_title("Rating Distribution for 2-Wheeler EVs")
ax.set_xlabel("Rating")
ax.set_ylabel("Frequency")
fig.tight_layout()
save_path1 = "2wheeler_rating_distribution.png"
fig.savefig(save_path1, dpi=300)
plt.show()
files.download(save_path1)

# 2. Average Rating by Model Name (skipped with a message if the column is absent)
if 'model_name' not in df_2w.columns:
    print("Column 'model_name' not found in the dataset.")
else:
    mean_by_model = df_2w.groupby("model_name")["rating"].mean()
    avg_rating_model = mean_by_model.reset_index()

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(avg_rating_model["model_name"], avg_rating_model["rating"], color='coral')
    ax.set_title("Average Rating by Model")
    ax.set_xlabel("Model Name")
    ax.set_ylabel("Average Rating")
    plt.setp(ax.get_xticklabels(), rotation=45)  # tilt the model labels
    fig.tight_layout()
    save_path2 = "2wheeler_avg_rating_by_model.png"
    fig.savefig(save_path2, dpi=300)
    plt.show()
    files.download(save_path2)

# 3. Scatter Plot: Visual Appeal vs Reliability (needs both columns)
if {'visual_appeal', 'reliability'}.issubset(df_2w.columns):
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(df_2w['visual_appeal'], df_2w['reliability'], color='green', alpha=0.6)
    ax.set_title("Visual Appeal vs Reliability for 2-Wheeler EVs")
    ax.set_xlabel("Visual Appeal")
    ax.set_ylabel("Reliability")
    fig.tight_layout()
    save_path3 = "2wheeler_visual_vs_reliability.png"
    fig.savefig(save_path3, dpi=300)
    plt.show()
    files.download(save_path3)
else:
    print("Required columns for scatter plot not found in the dataset.")


In [None]:
# Colab upload prompt. The next cell reads "EV_cars_India_2023.csv" by relative
# name, so the uploaded file is expected to carry exactly that name.
from google.colab import files
uploaded = files.upload()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files
import re

# Function to extract the first numeric value from a string (e.g., the lower bound of a range)
def extract_first_number(s):
    """Return the first number found in ``s`` as a float, or None.

    Parameters
    ----------
    s : str, number, or missing value
        Typically a free-text cell such as ``"Rs. 8.69 - 14.5 Lakh"``. Values
        that are already numeric are converted with ``float()`` directly.

    Returns
    -------
    float or None
        The first run of digits with an optional decimal part (``"12"``,
        ``"12.5"``, ``".5"``), or None when ``s`` is missing (per
        ``pd.isnull``) or contains no digits. Signs and thousands separators
        are not interpreted.
    """
    if pd.isnull(s):
        return None

    # Cells that pandas already parsed as numbers have no text to search.
    if not isinstance(s, str):
        try:
            return float(s)
        except (TypeError, ValueError):
            s = str(s)

    # Require at least one digit so stray punctuation (e.g. the "." in "Rs.")
    # is never captured and handed to float().
    match = re.search(r'\d+(?:\.\d+)?|\.\d+', s)
    if match:
        return float(match.group())
    return None

# Read the EV cars table and normalise its headers
# (trim, lower-case, spaces -> underscores).
df_cars = pd.read_csv("EV_cars_India_2023.csv")
df_cars.columns = [header.strip().lower().replace(' ', '_') for header in df_cars.columns]

print("EV Cars Dataset Head:", df_cars.head(), sep="\n")
print("\nMissing Values in Each Column:", df_cars.isnull().sum(), sep="\n")

# Derive numeric columns from the free-text ones: each new column holds the
# first number extract_first_number finds in the corresponding source column.
numeric_sources = {
    "price_numeric": "car_price",
    "range_numeric": "drive_range",
    "battery_numeric": "batter_cap",
}
for numeric_col, text_col in numeric_sources.items():
    df_cars[numeric_col] = df_cars[text_col].apply(extract_first_number)

# Show each text column next to the value extracted from it.
preview_cols = ["car_price", "price_numeric", "drive_range", "range_numeric", "batter_cap", "battery_numeric"]
print("\nSample Extracted Numeric Values:", df_cars[preview_cols].head(), sep="\n")

# Visualization 1: Histogram for Car Price Distribution (extracted first value)
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df_cars["price_numeric"].dropna(), bins=10, color='lightblue', edgecolor='black')
ax.set_title("Car Price Distribution (in lakh)")
ax.set_xlabel("Car Price (lakh)")
ax.set_ylabel("Frequency")
fig.tight_layout()
save_path1 = "ev_cars_price_distribution.png"
fig.savefig(save_path1, dpi=300)
plt.show()
files.download(save_path1)

# Visualization 2: Scatter Plot of Car Price vs Drive Range
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df_cars["price_numeric"], df_cars["range_numeric"], color='green', alpha=0.7)
ax.set_title("Car Price vs Drive Range")
ax.set_xlabel("Car Price (lakh)")
ax.set_ylabel("Drive Range (km)")
fig.tight_layout()
save_path2 = "ev_cars_price_vs_drive_range.png"
fig.savefig(save_path2, dpi=300)
plt.show()
files.download(save_path2)

# Visualization 3: Scatter Plot of Battery Capacity vs Drive Range
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df_cars["battery_numeric"], df_cars["range_numeric"], color='orange', alpha=0.7)
ax.set_title("Battery Capacity vs Drive Range")
ax.set_xlabel("Battery Capacity (kWh)")
ax.set_ylabel("Drive Range (km)")
fig.tight_layout()
save_path3 = "ev_cars_battery_vs_drive_range.png"
fig.savefig(save_path3, dpi=300)
plt.show()
files.download(save_path3)


In [None]:
# Colab upload prompt. The next cell reads "ev_battery_charging_data.csv" by
# relative name, so the uploaded file is expected to carry exactly that name.
from google.colab import files
uploaded = files.upload()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files

# Read the battery charging table and normalise its headers
# (trim, lower-case, spaces -> underscores).
df_battery = pd.read_csv("ev_battery_charging_data.csv")
df_battery.columns = [header.strip().lower().replace(' ', '_') for header in df_battery.columns]

print("ev_battery_charging_data Dataset Head:", df_battery.head(), sep="\n")
print("\nMissing Values in Each Column:", df_battery.isnull().sum(), sep="\n")

# Shorten "charging_duration_(min)" to "charging_time". rename() leaves the
# frame's labels untouched when the source column is absent.
df_battery = df_battery.rename(columns={"charging_duration_(min)": "charging_time"})

# Force the analysis columns to numeric; unparseable entries become NaN.
# "charging_time" is required from here on; "battery_capacity" is optional.
df_battery["charging_time"] = pd.to_numeric(df_battery["charging_time"], errors='coerce')
if "battery_capacity" not in df_battery.columns:
    print("Column 'battery_capacity' not found in dataset.")
else:
    df_battery["battery_capacity"] = pd.to_numeric(df_battery["battery_capacity"], errors='coerce')

# Visualization 1: Histogram for Charging Time Distribution
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df_battery["charging_time"].dropna(), bins=20, color='purple', edgecolor='black')
ax.set_title("Distribution of Charging Time")
ax.set_xlabel("Charging Time (minutes)")
ax.set_ylabel("Frequency")
fig.tight_layout()
save_path1 = "battery_charging_time_distribution.png"
fig.savefig(save_path1, dpi=300)
plt.show()
files.download(save_path1)

# Visualization 2: Scatter Plot of Battery Capacity vs. Charging Time
# (charging time on the x-axis, capacity on the y-axis; needs both columns)
if {"battery_capacity", "charging_time"}.issubset(df_battery.columns):
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(df_battery["charging_time"], df_battery["battery_capacity"], color='blue', alpha=0.6)
    ax.set_title("Battery Capacity vs Charging Time")
    ax.set_xlabel("Charging Time (minutes)")
    ax.set_ylabel("Battery Capacity (kWh)")
    fig.tight_layout()
    save_path2 = "battery_capacity_vs_charging_time.png"
    fig.savefig(save_path2, dpi=300)
    plt.show()
    files.download(save_path2)
else:
    print("Required columns for scatter plot not found in the dataset.")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# --- Prepare the data for ML ---

# Relies on df_cars from the EV-cars cell, including its derived columns
# "price_numeric", "battery_numeric" and "range_numeric".
# Rows missing any of the three values are dropped.
df_ml = df_cars[['price_numeric', 'battery_numeric', 'range_numeric']].dropna()
print("Data for ML (first 5 rows):")
print(df_ml.head())

# Features: price and battery capacity. Target: drive range.
X = df_ml[['price_numeric', 'battery_numeric']]
y = df_ml['range_numeric']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Linear Regression Model ---
lin_model = LinearRegression()
lin_model.fit(X_train, y_train)
y_pred_lin = lin_model.predict(X_test)

mse_lin = mean_squared_error(y_test, y_pred_lin)
r2_lin = r2_score(y_test, y_pred_lin)

print("Linear Regression Performance:")
print("Mean Squared Error:", mse_lin)
print("R^2 Score:", r2_lin)

# --- Random Forest Regression Model ---
# Same split as above so the two models' metrics are directly comparable.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("\nRandom Forest Regression Performance:")
print("Mean Squared Error:", mse_rf)
print("R^2 Score:", r2_rf)

# --- Visualization: Actual vs. Predicted Drive Range for Linear Regression ---
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_lin, color='blue', alpha=0.7, label="Predicted")
# Dashed y = x diagonal: points on it are perfect predictions.
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', label="Ideal Fit")
plt.title("Linear Regression: True vs. Predicted Drive Range")
plt.xlabel("True Drive Range (km)")
plt.ylabel("Predicted Drive Range (km)")
plt.legend()
plt.tight_layout()

# Save BEFORE plt.show(): once the figure has been shown (and released by the
# notebook backend), plt.savefig would act on a new, empty figure and write a
# blank image.
save_path = "linear_regression_true_vs_predicted.png"
plt.savefig(save_path, dpi=300)
plt.show()
# Uncomment the next line if running in Colab to trigger download:
# from google.colab import files; files.download(save_path)
