In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# Load data
df = pd.read_csv('Pollution.csv')

# Encode the free-text location columns as integer codes. Missing names are
# replaced by the placeholder 'Unknown' first so each encoder only ever sees
# strings.
le_country = LabelEncoder()
df['Country_enc'] = le_country.fit_transform(df['Country'].fillna('Unknown'))
le_city = LabelEncoder()
df['City_enc'] = le_city.fit_transform(df['City'].fillna('Unknown'))

# Ordinal codes for the AQI category labels ('Good' -> 0 ... 'Hazardous' -> 5).
aqi_cat_map = {
    'Good': 0,
    'Moderate': 1,
    'Unhealthy for Sensitive Groups': 2,
    'Unhealthy': 3,
    'Very Unhealthy': 4,
    'Hazardous': 5
}
category_cols = [
    'AQI Category', 'CO AQI Category', 'Ozone AQI Category',
    'NO2 AQI Category', 'PM2.5 AQI Category'
]
for col in category_cols:
    df[col + '_enc'] = df[col].map(aqi_cat_map)

# Features and target
features = [
    'CO AQI Value', 'CO AQI Category_enc',
    'Ozone AQI Value', 'Ozone AQI Category_enc',
    'NO2 AQI Value', 'NO2 AQI Category_enc',
    'PM2.5 AQI Value', 'PM2.5 AQI Category_enc',
    'Country_enc', 'City_enc'
]
target = 'AQI Category_enc'

# Series.map leaves NaN wherever a label is missing or not a key of
# aqi_cat_map. Rows with a NaN in any modelling column are excluded from
# training/evaluation, and the number excluded is reported rather than hidden.
usable_rows = df[features + [target]].notna().all(axis=1)
if not usable_rows.all():
    print(f"Dropping {(~usable_rows).sum()} rows with missing feature/target values")
model_df = df.loc[usable_rows]

# Train-test split (80/20, fixed seed). The target is cast to int because the
# mapped column is float whenever NaN was present before filtering.
X_train, X_test, y_train, y_test = train_test_split(
    model_df[features],
    model_df[target].astype(int),
    test_size=0.2,
    random_state=42
)

# Model training
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluation. The label list is pinned to all six categories so that
# target_names always lines up with the classes, even when a category is
# absent from this particular test split; zero_division=0 reports 0 for such
# classes instead of emitting undefined-metric warnings.
class_labels = list(aqi_cat_map.values())
class_names = list(aqi_cat_map.keys())
y_pred = clf.predict(X_test)
print(classification_report(
    y_test, y_pred,
    labels=class_labels,
    target_names=class_names,
    zero_division=0
))
# Same fixed label order, so the matrix is always 6x6 with rows/columns in
# aqi_cat_map order.
print(confusion_matrix(y_test, y_pred, labels=class_labels))


In [None]:
# Print a concise summary of the current DataFrame (columns, dtypes and
# non-null counts) as a quick sanity check. `df` is whatever an earlier cell
# left in the kernel; this cell does not load it itself.
df.info()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read a fresh copy of the dataset for this cell
df = pd.read_csv('Pollution.csv')

# Mean AQI per country, discarding any country whose mean came out as NaN
country_aqi = df.groupby('Country')['AQI Value'].mean().dropna()

# Identify the two extremes: country name and its average AQI
lowest_polluted_country, highest_polluted_country = country_aqi.idxmin(), country_aqi.idxmax()
lowest_aqi, highest_aqi = country_aqi.min(), country_aqi.max()

# Two-bar dataset: lowest first, highest second
countries = [lowest_polluted_country, highest_polluted_country]
aqi_values = [lowest_aqi, highest_aqi]

# Draw the comparison on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(countries, aqi_values, color=['green', 'red'])
ax.set_title('Lowest vs Highest Polluted Country (Average AQI)')
ax.set_ylabel('Average AQI Value')
ax.set_xlabel('Country')

# Write each bar's value (one decimal place) just above its top edge
for bar in bars:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.text(bar_centre, bar_top + 2, round(bar_top, 1), ha='center', va='bottom')

fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('Pollution.csv')

# Mean AQI per country, most polluted first.
country_avg = df.groupby('Country')['AQI Value'].mean().sort_values(ascending=False)
top_20 = country_avg.head(20)
bottom_20 = country_avg.tail(20).sort_values(ascending=True)

# The seaborn style sheet is named 'seaborn-v0_8' in current Matplotlib and
# plain 'seaborn' in older releases. Use whichever this installation provides
# and silently keep the default style if neither exists, instead of raising.
for style_name in ('seaborn-v0_8', 'seaborn'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# Top 20
# barh draws the first entry at the bottom. top_20 is already in descending
# order, so a single invert_yaxis() puts the most polluted country at the top.
# (Reversing the data as well would cancel the inversion out.)
plt.figure(figsize=(14, 8))
plt.barh(top_20.index, top_20.values, color='red')
plt.title('Top 20 Most Polluted Countries (Average AQI)')
plt.xlabel('Average AQI Value')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

# Bottom 20
# bottom_20 is in ascending order, so the lowest average sits on the bottom
# bar and values increase upwards.
plt.figure(figsize=(14, 8))
plt.barh(bottom_20.index, bottom_20.values, color='green')
plt.title('20 Least Polluted Countries (Average AQI)')
plt.xlabel('Average AQI Value')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read a fresh copy of the dataset for this cell
df = pd.read_csv('Pollution.csv')

# Records per country, most frequent first
country_counts = df['Country'].value_counts()

# Keep the ten most frequent countries as individual slices and pool
# everything after them into a single 'Other' slice
top_n = 10
top_countries = country_counts.iloc[:top_n]
other_count = country_counts.iloc[top_n:].sum()

labels = list(top_countries.index)
sizes = list(top_countries.values)
if other_count > 0:
    # Only add the pooled slice when there is something to pool
    labels.append('Other')
    sizes.append(other_count)

# Pie chart on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140, counterclock=False)
ax.set_title('Distribution of Records by Country')
ax.axis('equal')  # equal aspect ratio keeps the pie circular
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('Pollution.csv')

# Number of records in each overall AQI category, most frequent first
aqi_counts = df['AQI Category'].value_counts()

# Bar chart drawn onto an explicit axes object
fig, ax = plt.subplots(figsize=(8, 5))
aqi_counts.plot.bar(ax=ax, color='skyblue')
ax.set_title('Count of Each AQI Category')
ax.set_xlabel('AQI Category')
ax.set_ylabel('Number of Records')
# Tilt the category names so the longer labels stay readable
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()


In [None]:
import seaborn as sns

# Pairwise correlation between the overall AQI and each pollutant's AQI.
# Relies on `df` and `plt` already being defined by an earlier cell.
aqi_value_cols = [
    'AQI Value',
    'CO AQI Value',
    'Ozone AQI Value',
    'NO2 AQI Value',
    'PM2.5 AQI Value',
]
corr = df[aqi_value_cols].corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation between AQI and Pollutants')
plt.show()
