In [1]:
import numpy as np
import pandas as pd

# Read the labelled training split and the unlabelled test split from the
# working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [2]:
# Remove bookkeeping columns that are not model features. `id` is kept on
# `test` because the submission files written later are keyed by it.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# makes this cell safe to re-run: a second execution is a no-op rather than a
# KeyError on the already-removed columns.
train = train.drop(columns=['Unnamed: 0', 'id'], errors='ignore')
test = test.drop(columns=['Unnamed: 0'], errors='ignore')

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line after each report.
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156620 entries, 0 to 156619
Data columns (total 28 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   age                  156620 non-null  int64  
 1   weight(kg)           156620 non-null  int64  
 2   waist(cm)            156620 non-null  float64
 3   hearing(left)        156620 non-null  int64  
 4   hearing(right)       156620 non-null  int64  
 5   systolic             156620 non-null  int64  
 6   relaxation           156620 non-null  int64  
 7   fasting blood sugar  156620 non-null  int64  
 8   Cholesterol          156620 non-null  int64  
 9   triglyceride         156620 non-null  int64  
 10  HDL                  156620 non-null  int64  
 11  LDL                  156620 non-null  int64  
 12  hemoglobin           156620 non-null  float64
 13  Urine protein        156620 non-null  int64  
 14  serum creatinine     156620 non-null  float64
 15  AST              

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
# Target is the `smoking` column; every remaining column is used as a feature.
X, y = train.drop(columns=['smoking']), train['smoking']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training portion only, then apply the same statistics
# to both portions. X_train / X_test are rebound to the scaled outputs here and
# every later cell works on these scaled versions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)



In [4]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Seed TensorFlow's global generator so that weight initialisation and dropout
# are repeatable when the notebook is restarted and re-run (some GPU kernels
# may still introduce small non-determinism). 42 matches the seed used for the
# train/test split.
tf.random.set_seed(42)

# Feed-forward network: two ReLU hidden layers, each followed by dropout, and a
# single sigmoid unit that outputs a probability for the positive class.
model_fnn = keras.Sequential([
    layers.Input(shape=(X_train.shape[1],)),  # one input per scaled feature column
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1, activation='sigmoid'),
])

model_fnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train on the scaled training split.
model_fnn.fit(X_train, y_train, epochs=15, batch_size=64)

# predict() yields probabilities; threshold at 0.5 to get 0/1 labels.
FNN_y_pred = (model_fnn.predict(X_test) > 0.5).astype(int)

# Hold-out accuracy and confusion matrix.
accuracy = accuracy_score(y_test, FNN_y_pred)
print(f"fnn Accuracy: {accuracy}")

conf_matrix = confusion_matrix(y_test, FNN_y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
fnn Accuracy: 0.7739752266632614
Confusion Matrix:
[[13428  4334]
 [ 2746 10816]]


In [5]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Seed TensorFlow's global generator so weight initialisation is repeatable
# across Restart & Run All (GPU kernels may still add small non-determinism).
tf.random.set_seed(42)

# The recurrent layer is declared with a 3-D input (samples, timesteps, 1):
# each scaled feature column is treated as one timestep carrying one value.
# Both splits are reshaped explicitly; previously the reshaped training array
# was built but the 2-D arrays were passed to fit()/predict(), leaving the
# rank adaptation to Keras.
X_train_reshape = X_train[..., np.newaxis]
X_test_reshape = X_test[..., np.newaxis]

# Single SimpleRNN layer followed by a sigmoid unit for binary classification.
model_rnn = keras.Sequential([
    layers.Input(shape=X_train_reshape.shape[1:]),
    layers.SimpleRNN(64, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

model_rnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train and score on the explicitly reshaped tensors.
model_rnn.fit(X_train_reshape, y_train, epochs=15, batch_size=64)

# predict() yields probabilities; threshold at 0.5 to get 0/1 labels.
rnn_y_pred = (model_rnn.predict(X_test_reshape) > 0.5).astype(int)

# Hold-out accuracy and confusion matrix.
accuracy = accuracy_score(y_test, rnn_y_pred)
print(f"rnn Accuracy: {accuracy}")

conf_matrix = confusion_matrix(y_test, rnn_y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
rnn Accuracy: 0.7661218235219002
Confusion Matrix:
[[13329  4433]
 [ 2893 10669]]


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic regression with scikit-learn's default settings, fitted on the
# scaled training split (fit() returns the estimator itself).
lr_model = LogisticRegression().fit(X_train, y_train)

# Class labels predicted for the hold-out split.
lr_y_pred = lr_model.predict(X_test)

# Score against the hold-out labels, then report both metrics.
accuracy = accuracy_score(y_test, lr_y_pred)
conf_matrix = confusion_matrix(y_test, lr_y_pred)
print(f"logistic regression Accuracy: {accuracy}")
print("Confusion Matrix:", conf_matrix, sep="\n")

logistic regression Accuracy: 0.7527774230621888
Confusion Matrix:
[[13389  4373]
 [ 3371 10191]]


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest of 100 trees with a fixed seed, fitted on the scaled training
# split (fit() returns the estimator itself).
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Class labels predicted for the hold-out split.
rf_predictions = rf_classifier.predict(X_test)

# Score against the hold-out labels, then report both metrics.
rf_accuracy = accuracy_score(y_test, rf_predictions)
conf_matrix = confusion_matrix(y_test, rf_predictions)
print(f"Random Forest Accuracy: {rf_accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")

Random Forest Accuracy: 0.7670157068062827
Confusion Matrix:
[[13396  4366]
 [ 2932 10630]]


In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# k-nearest-neighbours classifier voting over the 5 closest training rows,
# fitted on the scaled training split (fit() returns the estimator itself).
knn_classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Class labels predicted for the hold-out split.
knn_predictions = knn_classifier.predict(X_test)

# Score against the hold-out labels, then report both metrics.
knn_accuracy = accuracy_score(y_test, knn_predictions)
conf_matrix = confusion_matrix(y_test, knn_predictions)
print(f"KNN Accuracy: {knn_accuracy}")
print("Confusion Matrix:", conf_matrix, sep="\n")

KNN Accuracy: 0.7273336738602988
Confusion Matrix:
[[12970  4792]
 [ 3749  9813]]


In [9]:
from xgboost import XGBClassifier

# Gradient-boosted trees for binary classification. No GPU option
# (tree_method / device) is passed, so training runs on whatever device
# XGBoost uses by default.
xgb_model = XGBClassifier(
    objective="binary:logistic",
    gamma=1,
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=3,
    n_estimators=300,
    random_state=42,
)

# Fit on the scaled training split.
xgb_model.fit(X_train, y_train)

# Class labels predicted for the hold-out split.
xgb_y_pred = xgb_model.predict(X_test)

# Hold-out accuracy and confusion matrix. The label previously read
# "XGBoost with GPU", which did not match the configuration above.
accuracy = accuracy_score(y_test, xgb_y_pred)
print(f"XGBoost Accuracy: {accuracy}")
conf_matrix = confusion_matrix(y_test, xgb_y_pred)
print("Confusion Matrix:")
print(conf_matrix)



XGBoost with GPU Accuracy: 0.7801047120418848
Confusion Matrix:
[[13495  4267]
 [ 2621 10941]]


In [10]:
# Model inputs for the test set: every column except the row `id`.
# drop() returns a new frame, so `test` itself keeps `id` for the submissions.
test_features = test.drop(columns=['id'])

# Apply the scaler that was fitted on the training split. `test_data_pred` is
# the scaled feature matrix that every model predicts on from here on.
test_data_pred = scaler.transform(test_features)

# XGBoost-only submission: one predicted label per test `id`.
w_pred = pd.DataFrame({
    'id': test['id'],
    'xgb_smoking': xgb_model.predict(test_data_pred),
})

w_pred.to_csv('xgb.csv', index=False)

In [11]:
print(test.shape)

# Keras predict() returns one probability per row in a 2-D array (the models
# end in a single sigmoid unit); take column 0 to get a 1-D vector.
output_fnn = model_fnn.predict(test_data_pred)[:, 0]

# The RNN was declared with a (timesteps, 1) input, so add the trailing axis
# explicitly instead of relying on Keras to adapt a 2-D array.
output_rnn = model_rnn.predict(np.expand_dims(test_data_pred, axis=-1))[:, 0]

# One 0/1 prediction column per model, alongside the test row `id` (kept as a
# regular column, not as the index). Neural-network probabilities are
# thresholded at 0.5. The KNN model fitted earlier is not included here.
results_df = pd.DataFrame({
    'id': test['id'],
    'xgb_smoking': xgb_model.predict(test_data_pred),
    'lr_smoking': lr_model.predict(test_data_pred),
    'rf_smoking': rf_classifier.predict(test_data_pred),
    'fnn_smoking': (output_fnn >= 0.5).astype(int),
    'rnn_smoking': (output_rnn >= 0.5).astype(int),
})

# info() prints its own report and returns None, so no print() wrapper
# (which would add a stray "None" line).
results_df.info()

# Persist the per-model predictions.
results_df.to_csv('submission.csv', index=False)

(106171, 28)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106171 entries, 0 to 106170
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   id           106171 non-null  int64
 1   xgb_smoking  106171 non-null  int32
 2   lr_smoking   106171 non-null  int64
 3   rf_smoking   106171 non-null  int64
 4   fnn_smoking  106171 non-null  int32
 5   rnn_smoking  106171 non-null  int32
dtypes: int32(3), int64(3)
memory usage: 3.6 MB
None


In [12]:
import pandas as pd
# Majority vote over the per-model 0/1 columns of `results_df`.
# Columns are named explicitly (rather than "everything after the first") so
# that adding an unrelated column to `results_df` cannot silently join the vote.
vote_columns = ['xgb_smoking', 'lr_smoking', 'rf_smoking',
                'fnn_smoking', 'rnn_smoking']

# Fraction of models that predicted 1 for each row. The >= 0.5 step mirrors the
# original thresholding and is a no-op for columns that already hold 0/1.
vote_share = (results_df[vote_columns] >= 0.5).mean(axis=1)

# A strict "> 0.5" reproduces the earlier .round() behaviour exactly: an exact
# 0.5 was rounded half-to-even (down to 0), so ties already resolved to 0.
final = pd.DataFrame({
    'id': results_df['id'],
    'avg_prediction': (vote_share > 0.5).astype(int),
})

# info() prints its own report and returns None, so no print() wrapper.
final.info()

# Persist the ensemble prediction.
final.to_csv('single_sub.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106171 entries, 0 to 106170
Data columns (total 2 columns):
 #   Column          Non-Null Count   Dtype
---  ------          --------------   -----
 0   id              106171 non-null  int64
 1   avg_prediction  106171 non-null  int32
dtypes: int32(1), int64(1)
memory usage: 1.2 MB
None
