<a href="https://colab.research.google.com/github/AshleyNyaboke/daily-data-drills/blob/main/prediction_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [10]:
import numpy as np
import pandas as pd

# Seed NumPy's global generator so the synthetic sample is identical on every run.
np.random.seed(42)

# Number of synthetic records to generate.
n_rows = 400

# One gender label per row; both labels are drawn with equal probability.
genders = np.random.choice(['male', 'female'], size=n_rows, p=[0.5, 0.5])

# (mean, std) pairs for height in cm and for BMI, per gender.
male_height, male_bmi = (175, 7), (23, 3)
female_height, female_bmi = (162, 6), (22, 3)

heights = np.zeros(n_rows)
weights = np.zeros(n_rows)

# For every row draw the height first and the BMI second; this draw order
# determines the generated values, so it must not be rearranged.
for row, gender in enumerate(genders):
    if gender == 'male':
        height_params, bmi_params = male_height, male_bmi
    else:
        height_params, bmi_params = female_height, female_bmi

    heights[row] = np.random.normal(*height_params)
    bmi = np.random.normal(*bmi_params)

    # BMI formula rearranged for weight: weight = BMI * (height / 100)^2
    weights[row] = bmi * (heights[row] / 100) ** 2

# Perturb every weight with Gaussian noise (mean 0, std 1.5).
weight_noise = np.random.normal(0, 1.5, n_rows)
weights = weights + weight_noise

# Keep one decimal place for both measurements.
heights = heights.round(1)
weights = weights.round(1)

# 1-based record identifier stored as a regular column.
indices = range(1, n_rows + 1)

# Assemble the frame; `columns` fixes the order index, gender, height, weight.
column_order = ['index', 'gender', 'height', 'weight']
df = pd.DataFrame(
    {
        'index': indices,
        'gender': genders,
        'height': heights,
        'weight': weights,
    },
    columns=column_order,
)

# Persist the dataset next to the notebook (no pandas row index in the file).
df.to_csv('height_weight_gender_dataset.csv', index=False)

# Report what was generated.
print("Dataset generated successfully!")
print(f"Dataset shape: {df.shape}")

print(f"\nGender distribution:")
gender_counts = df['gender'].value_counts()
print(gender_counts)

print(f"\nFirst 10 rows:")
first_rows = df.head(10)
print(first_rows)

print(f"\nDataset statistics by gender:")
stats_by_gender = df.groupby('gender').describe()
print(stats_by_gender)

Dataset generated successfully!
Dataset shape: (400, 4)

Gender distribution:
gender
female    207
male      193
Name: count, dtype: int64

First 10 rows:
   index  gender  height  weight
0      1    male   184.1    78.4
1      2  female   166.1    58.7
2      3  female   163.9    61.7
3      4  female   162.6    62.8
4      5    male   169.3    84.2
5      6    male   168.0    56.2
6      7    male   183.1    86.7
7      8  female   165.7    67.4
8      9  female   161.9    51.6
9     10  female   162.5    51.0

Dataset statistics by gender:
        index                                                          height  \
        count        mean         std  min    25%    50%    75%    max  count   
gender                                                                          
female  207.0  206.333333  113.282275  2.0  113.5  212.0  298.0  400.0  207.0   
male    193.0  194.243523  118.037987  1.0   91.0  182.0  301.0  398.0  193.0   

                    ...               weight 

In [11]:

# Show the generated DataFrame; the notebook renders a truncated preview
# (first and last rows) rather than all 400 records.
df

Unnamed: 0,index,gender,height,weight
0,1,male,184.1,78.4
1,2,female,166.1,58.7
2,3,female,163.9,61.7
3,4,female,162.6,62.8
4,5,male,169.3,84.2
...,...,...,...,...
395,396,female,168.6,55.9
396,397,female,165.7,50.0
397,398,male,170.6,83.4
398,399,female,160.9,57.5


In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the dataset.
df.describe()

Unnamed: 0,index,height,weight
count,400.0,400.0,400.0
mean,200.5,168.70475,64.5205
std,115.614301,8.913559,11.318146
min,1.0,147.2,34.7
25%,100.75,161.875,55.975
50%,200.5,168.1,64.05
75%,300.25,175.3,72.25
max,400.0,196.6,101.8


In [13]:
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [15]:
# Encode the gender labels as integers. In the saved output of this notebook
# 'female' is displayed as 0 and 'male' as 1.
# NOTE: this overwrites df['gender'] in place, so the original string labels
# are only recoverable through the fitted encoder `jt`.
jt = LabelEncoder()
jt.fit(df['gender'])
df['gender'] = jt.transform(df['gender'])

In [16]:
!pip install numpy



In [17]:
# Display the DataFrame again to inspect the `gender` column after the
# label-encoding cell has replaced the string labels with integers.
df

Unnamed: 0,index,gender,height,weight
0,1,1,184.1,78.4
1,2,0,166.1,58.7
2,3,0,163.9,61.7
3,4,0,162.6,62.8
4,5,1,169.3,84.2
...,...,...,...,...
395,396,0,168.6,55.9
396,397,0,165.7,50.0
397,398,1,170.6,83.4
398,399,0,160.9,57.5
