<a href="https://colab.research.google.com/github/2303A52310/AIML_2303A52310/blob/main/AIML_FinalExam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error

# Load the dataset
df = pd.read_csv("/content/gender+by+name.zip")

# Q1. Identify the Top-10 Popular Names Given to Babies
top_10_names = df.groupby("Name")["Count"].sum().sort_values(ascending=False).head(10)
print("Top-10 Popular Names:\n", top_10_names)

# Q2. Find the Max and Min Count of Babies with Similar Names
name_counts = df.groupby("Name")["Count"].sum()
max_count = name_counts.max()
min_count = name_counts.min()
print(f"Max Count: {max_count}, Min Count: {min_count}")

# Q3. Name the Gender with the Most Names Among the Babies
gender_counts = df.groupby("Gender")["Name"].nunique()
most_names_gender = gender_counts.idxmax()
print(f"Gender with the most names: {most_names_gender}")

# Q4. Identify the Top-5 Probabilities of Male and Female Babies
total_count = df["Count"].sum()
gender_probabilities = (
    df.groupby("Gender")["Count"].sum() / total_count
).sort_values(ascending=False)
print("Probabilities of Male and Female:\n", gender_probabilities)

# Q5. Name the Top-5 Names with Highest Probabilities
df["Total Count"] = df.groupby("Name")["Count"].transform("sum")
df["Probability"] = df["Count"] / df["Total Count"]
top_5_names_probabilities = (
    df.groupby("Name")["Probability"].mean().sort_values(ascending=False).head(5)
)
print("Top-5 Names with Highest Probabilities:\n", top_5_names_probabilities)

# Q6. Apply Classification Model to Predict Gender by Name Length
# Add a feature for name length
df["Name Length"] = df["Name"].apply(len)

# Convert Gender to binary (Male=1, Female=0)
df["Gender_Binary"] = df["Gender"].apply(lambda x: 1 if x == "Male" else 0)

# Features and target
X = df[["Name Length"]]
y = df["Gender_Binary"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"Model Accuracy: {accuracy:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Top-10 Popular Names:
 Name
James      5328370
John       5282978
Robert     4990971
Michael    4602810
William    4242917
Mary       4184969
David      3801028
Joseph     2707061
Richard    2647959
Charles    2446151
Name: Count, dtype: int64
Max Count: 5328370, Min Count: 1
Gender with the most names: F
Probabilities of Male and Female:
 Gender
M    0.505681
F    0.494319
Name: Count, dtype: float64
Top-5 Names with Highest Probabilities:
 Name
Kim-Maree    1.0
Munawara     1.0
Muneo        1.0
Munender     1.0
Muneerah     1.0
Name: Probability, dtype: float64
Model Accuracy: 1.00
Mean Squared Error: 0.00
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     44181

    accuracy                           1.00     44181
   macro avg       1.00      1.00      1.00     44181
weighted avg       1.00      1.00      1.00     44181

