In [None]:
# Mount Google Drive at /content/drive so the dataset CSVs stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import warnings

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from xgboost import XGBClassifier
warnings.filterwarnings("ignore")

# Read the train/test CSVs from the datasets folder on the mounted Drive.
dataset_root = 'drive/My Drive/Colab Notebooks/datasets/'
train_csv_path = dataset_root + "boy or girl 2025 train_missingValue.csv"
test_csv_path = dataset_root + "boy or girl 2025 test no ans_missingValue.csv"
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [None]:
# Column dtypes and non-null counts of the raw training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423 entries, 0 to 422
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          423 non-null    int64  
 1   gender      423 non-null    int64  
 2   star_sign   337 non-null    object 
 3   phone_os    345 non-null    object 
 4   height      349 non-null    float64
 5   weight      338 non-null    float64
 6   sleepiness  332 non-null    float64
 7   iq          344 non-null    float64
 8   fb_friends  346 non-null    float64
 9   yt          333 non-null    object 
 10  self_intro  319 non-null    object 
dtypes: float64(5), int64(2), object(4)
memory usage: 36.5+ KB


In [None]:
# Column dtypes and non-null counts of the raw test frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426 entries, 0 to 425
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          426 non-null    int64  
 1   gender      426 non-null    int64  
 2   star_sign   345 non-null    object 
 3   phone_os    330 non-null    object 
 4   height      358 non-null    float64
 5   weight      330 non-null    float64
 6   sleepiness  354 non-null    float64
 7   iq          334 non-null    float64
 8   fb_friends  338 non-null    float64
 9   yt          348 non-null    float64
 10  self_intro  333 non-null    object 
dtypes: float64(6), int64(2), object(3)
memory usage: 36.7+ KB


In [None]:
# Distinct phone_os values (NaN included) -- useful for spotting invalid categories.
train_df['phone_os'].unique()

array(['Apple', nan, 'Android', 'Windows Phone', 'JohnCena'], dtype=object)

In [None]:
# Distinct weight values (NaN included) -- useful for spotting implausible entries.
train_df['weight'].unique()

array([ 4.30e+001,  4.70e+001,  6.10e+001,  6.20e+001,  6.70e+001,
              nan,  5.00e+001,  5.70e+001,  5.50e+001,  6.00e+001,
        5.30e+001,  5.20e+001,  7.50e+001,  1.80e+002,  4.80e+001,
        7.30e+001,  8.00e+001,  7.40e+001,  5.80e+001,  7.00e+001,
        6.50e+001,  6.40e+001,  1.25e+002,  2.00e+002,  8.50e+001,
        8.30e+001,  7.10e+001,  8.20e+001,  6.30e+001,  1.23e+002,
        6.80e+001,  6.60e+001,  6.90e+001,  4.90e+001,  7.20e+001,
        8.60e+001,  7.00e+000,  5.90e+001,  1.00e+111,  9.90e+001,
        1.00e+002,  5.40e+001,  4.40e+001,  7.60e+001,  4.60e+001,
        9.00e+001,  5.60e+001,  8.70e+001,  5.00e+002,  7.80e+001,
        1.20e+001,  5.10e+001,  4.50e+001, -1.00e+003,  4.00e+001,
        1.87e+002,  1.00e+001,  1.10e+002,  7.90e+001])

In [None]:
# Summary statistics of the numeric columns before any outlier cleaning.
train_df.describe()

Unnamed: 0,id,gender,height,weight,sleepiness,iq,fb_friends
count,423.0,423.0,349.0,338.0,332.0,344.0,346.0
mean,212.0,1.252955,2.8653299999999997e+108,2.95858e+108,3.418675,124.723837,12515100.0
std,122.253834,0.43522,5.352877e+109,5.439283e+109,1.222652,37.683304,175582600.0
min,1.0,1.0,-187.0,-1000.0,1.0,50.0,-1000.0
25%,106.5,1.0,165.0,55.0,3.0,100.0,200.0
50%,212.0,1.0,171.0,64.5,3.0,120.0,400.0
75%,317.5,2.0,176.0,72.75,4.0,145.0,700.0
max,423.0,2.0,1e+111,1e+111,5.0,200.0,3000000000.0


In [None]:
# Function to clean outliers
def clean_outliers(df):
    """Return a copy of ``df`` with implausible numeric values replaced by NaN.

    Processing steps:

    1. ``yt`` is coerced to numeric; entries that cannot be parsed become NaN.
    2. ``height``, ``weight``, ``fb_friends`` and ``yt`` are replaced by their
       absolute value, so a negative entry is kept with its sign dropped.
    3. For each of ``height``, ``weight``, ``iq``, ``fb_friends`` and ``yt``
       the fences ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` are computed from
       that column of ``df`` itself, and every value outside the (inclusive)
       fences is set to NaN. Existing NaN values stay NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``height``, ``weight``, ``iq``,
        ``fb_friends`` and ``yt``. Other columns are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index. The input frame is not
        modified, so calling the function does not alter the caller's data.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    columns_to_clean = ['height', 'weight', 'iq', 'fb_friends', 'yt']
    abs_columns = ['height', 'weight', 'fb_friends', 'yt']

    # Work on a copy so the caller's frame is left untouched.
    cleaned = df.copy()

    cleaned['yt'] = pd.to_numeric(cleaned['yt'], errors='coerce')
    cleaned[abs_columns] = cleaned[abs_columns].abs()

    for col in columns_to_clean:
        q1 = cleaned[col].quantile(0.25)
        q3 = cleaned[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Values outside the fences become NaN; NaN never satisfies
        # `between`, so missing values remain missing.
        in_range = cleaned[col].between(lower_bound, upper_bound)
        cleaned[col] = cleaned[col].where(in_range)

    return cleaned

# Prepare features and target
features = ['star_sign', 'phone_os', 'height', 'weight', 'sleepiness',
           'iq', 'fb_friends', 'yt', 'self_intro']

# Clean outliers in both datasets.
# NOTE: the cleaned frames overwrite the raw ones, and the IQR fences are
# recomputed from whatever the frame currently holds, so re-running this cell
# filters the already-cleaned data a second time.
train_df = clean_outliers(train_df)
test_df = clean_outliers(test_df)

# Take explicit copies of the feature columns: later cells assign new and
# re-encoded columns into X_train / X_test, which must not happen on a slice
# of train_df / test_df.
X_train = train_df[features].copy()
y_train = train_df['gender']
X_test = test_df[features].copy()
test_ids = test_df['id']

In [None]:
# Summary statistics of the numeric columns after clean_outliers was applied.
train_df.describe()

Unnamed: 0,id,gender,height,weight,sleepiness,iq,fb_friends,yt
count,423.0,423.0,336.0,325.0,332.0,344.0,318.0,277.0
mean,212.0,1.252955,170.43622,64.283077,3.418675,124.723837,430.059748,3.530176
std,122.253834,0.43522,8.064255,11.786417,1.222652,37.683304,338.127297,4.569706
min,1.0,1.0,150.0,40.0,1.0,50.0,0.0,0.0
25%,106.5,1.0,165.0,55.0,3.0,100.0,163.25,0.65
50%,212.0,1.0,171.0,64.0,3.0,120.0,350.5,1.7
75%,317.5,2.0,176.0,72.0,4.0,145.0,600.0,5.0
max,423.0,2.0,190.0,100.0,5.0,200.0,1362.0,20.7


In [None]:
# Define preprocessing for numerical and categorical columns
numeric_features = ['height', 'weight', 'sleepiness', 'iq', 'fb_friends', 'yt']
categorical_features = ['star_sign', 'phone_os']

# Numeric columns: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the literal 'missing', then
# map each category to an integer code.
# OrdinalEncoder is used rather than LabelEncoder because LabelEncoder encodes
# a 1-D target (its fit/fit_transform accept only `y`) and raises a TypeError
# when fitted as a Pipeline step on feature columns. Categories not seen
# during fit are encoded as -1 instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Handle self_intro separately (convert to text length)
def extract_text_length(df):
    """Replace the free-text ``self_intro`` column with its character count.

    Missing introductions count as length 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``self_intro`` column of strings (NaN allowed).

    Returns
    -------
    pandas.DataFrame
        A new frame without ``self_intro`` and with a ``self_intro_length``
        column appended as the last column. The input frame is not modified,
        which also avoids writing into a slice of another frame.

    Raises
    ------
    KeyError
        If ``df`` has no ``self_intro`` column.
    """
    intro_length = df['self_intro'].fillna('').str.len()
    return df.drop(columns='self_intro').assign(self_intro_length=intro_length)

# Swap the raw self_intro text for its length in both feature frames.
X_train, X_test = extract_text_length(X_train), extract_text_length(X_test)

# The feature list now refers to self_intro_length instead of self_intro,
# and the new column is treated as numeric.
features = ['star_sign', 'phone_os', 'height', 'weight', 'sleepiness',
           'iq', 'fb_friends', 'yt', 'self_intro_length']
numeric_features += ['self_intro_length']

# Numeric columns go through the numeric pipeline; categorical columns are
# passed through untouched because they are label-encoded by hand.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', 'passthrough', categorical_features),
])

# Encode categorical variables
le_star = LabelEncoder()
le_phone = LabelEncoder()

# Fit each encoder on the train + test labels (missing values mapped to the
# literal 'missing') so that transforming the test set later cannot fail on a
# category that appears only there. Only the label vocabulary is shared; no
# target information is involved.
combined_star_sign = pd.concat([X_train['star_sign'], X_test['star_sign']], axis=0).fillna('missing')
combined_phone_os = pd.concat([X_train['phone_os'], X_test['phone_os']], axis=0).fillna('missing')

# Fit the encoders
le_star.fit(combined_star_sign)
le_phone.fit(combined_phone_os)

# Replace the training categories with their integer codes
X_train['star_sign'] = le_star.transform(X_train['star_sign'].fillna('missing'))
X_train['phone_os'] = le_phone.transform(X_train['phone_os'].fillna('missing'))

In [None]:
# Encode the test-set categoricals with the already-fitted encoders.
for column, encoder in (('star_sign', le_star), ('phone_os', le_phone)):
    X_test[column] = encoder.transform(X_test[column].fillna('missing'))

# Preprocessing followed by a random-forest classifier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_split=5,
        random_state=42,
    )),
])

# Hold out 20% of the training rows for validation.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train[features],
    y_train,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [None]:
# Fit the model on the training part of the split
pipeline.fit(X_train_split, y_train_split)

# Validate the model on the held-out rows.
val_predictions = pipeline.predict(X_val)
print(f"Validation Accuracy: {accuracy_score(y_val, val_predictions):.4f}")
# f1_score is called with its defaults, so this is the binary F1 of label 1.
print(f"Validation F1: {f1_score(y_val, val_predictions):.4f}")

# Make predictions on test set
test_predictions = pipeline.predict(X_test[features])

# Create output DataFrame: one predicted gender per test id
output_df = pd.DataFrame({
    'id': test_ids,
    'gender': test_predictions
})

# Save predictions
output_df.to_csv('gender_predictions.csv', index=False)
print("Predictions saved to 'gender_predictions.csv'")

Validation Accuracy: 0.8471
Validation Accuracy: 0.9037
Predictions saved to 'gender_predictions.csv'


In [None]:
# Load an earlier prediction file and the one written by this notebook.
pred1 = pd.read_csv('gender_predictions-1.csv')
pred2 = pd.read_csv('gender_predictions.csv')

# Row mask: True where the two files predict a different gender.
diff = pred1['gender'] != pred2['gender']

# Place the disagreeing rows of both files side by side.
comparison = pd.concat({'pred1': pred1.loc[diff], 'pred2': pred2.loc[diff]}, axis=1)

# Show the side-by-side comparison.
print(comparison)

    pred1        pred2       
       id gender    id gender
23     24      1    24      2
42     43      1    43      2
88     89      2    89      1
176   177      2   177      1
234   235      2   235      1
250   251      2   251      1
271   272      2   272      1
300   301      1   301      2
322   323      2   323      1
349   350      2   350      1
391   392      2   392      1
