## Machine Learning with Statistics

### Question 1: Train-Test Split
#### Task: Split a dataset into training and testing sets.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Toy dataset: two numeric feature columns and one binary label column
data = dict(
    feature1=[1, 2, 3, 4, 5],
    feature2=[10, 20, 30, 40, 50],
    target=[0, 1, 0, 1, 0],
)
df = pd.DataFrame(data)

# Separate the predictor columns from the label column
X = df.loc[:, ['feature1', 'feature2']]
y = df.loc[:, 'target']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Display each partition: train features, test features, train labels, test labels
for subset in (X_train, X_test, y_train, y_test):
    print(subset)


### Question 2: Logistic Regression
#### Task: Train a logistic regression model and make predictions.

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Toy dataset: two numeric feature columns and one binary label column
data = dict(
    feature1=[1, 2, 3, 4, 5],
    feature2=[10, 20, 30, 40, 50],
    target=[0, 1, 0, 1, 0],
)
df = pd.DataFrame(data)

# Predictors (X) and label (y)
X = df.loc[:, ['feature1', 'feature2']]
y = df.loc[:, 'target']

# Reserve 20% of the rows for testing, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build and fit the classifier in one step (fit() returns the estimator itself)
model = LogisticRegression().fit(X_train, y_train)

# Predict labels for the held-out rows and display them
predictions = model.predict(X_test)
print(predictions)


### Question 3: Cross-Validation
#### Task: Perform k-fold cross-validation on a decision tree classifier.

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# Sample DataFrame: two numeric features and a binary target (three 0s, two 1s)
data = {'feature1': [1, 2, 3, 4, 5],
        'feature2': [10, 20, 30, 40, 50],
        'target': [0, 1, 0, 1, 0]}
df = pd.DataFrame(data)

# Features and target
X = df[['feature1', 'feature2']]
y = df['target']

# Decision tree classifier, seeded so that the choice between equally good
# splits is reproducible from run to run
model = DecisionTreeClassifier(random_state=42)

# Perform 2-fold stratified cross-validation. The minority class has only two
# samples, so two folds is the most that keeps it represented in every fold.
# Shuffling with a fixed seed keeps the fold assignment repeatable.
cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X, y, cv=cv)

# Print each score
for fold_index, score in enumerate(cv_scores, start=1):
    print(f"Fold {fold_index} score: {score}")

# Print mean and standard deviation of scores
print(f"Mean CV Score: {cv_scores.mean()}")
print(f"Standard Deviation of CV Scores: {cv_scores.std()}")


### Question 4: Hyperparameter Tuning with Grid Search
#### Task: Perform hyperparameter tuning using GridSearchCV on a random forest classifier.

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Sample DataFrame: two numeric features and a binary target (three 0s, two 1s)
data = {'feature1': [1, 2, 3, 4, 5],
        'feature2': [10, 20, 30, 40, 50],
        'target': [0, 1, 0, 1, 0]}
df = pd.DataFrame(data)

# Features and target
X = df[['feature1', 'feature2']]
y = df['target']

# Random forest classifier, seeded so that the grid-search scores (and hence
# the reported best parameters) are reproducible from run to run
model = RandomForestClassifier(random_state=42)

# Hyperparameter grid: every combination of these values is evaluated
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20]
}

# Grid search. An integer cv uses stratified folds for classifiers; the
# minority class has only two samples, so cv must not exceed 2 or scikit-learn
# warns that the least populated class has fewer members than n_splits.
grid_search = GridSearchCV(model, param_grid, cv=2)
grid_search.fit(X, y)

# Best parameters
print(grid_search.best_params_)


### Question 5: Evaluation Metrics
#### Task: Evaluate a model using accuracy, precision, recall, and F1 score.

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Sample DataFrame: two numeric features and a binary target
data = {'feature1': [1, 2, 3, 4, 5],
        'feature2': [10, 20, 30, 40, 50],
        'target': [0, 1, 0, 1, 0]}
df = pd.DataFrame(data)

# Features and target
X = df[['feature1', 'feature2']]
y = df['target']

# Split into train and test sets (20% of 5 rows leaves a single test row)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

# Evaluate model.
# With a single test row, precision/recall/F1 are undefined whenever there are
# no predicted or no actual positives. zero_division=0 reports 0.0 in that case
# (the same value as the default) without emitting UndefinedMetricWarning.
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, zero_division=0)
recall = recall_score(y_test, predictions, zero_division=0)
f1 = f1_score(y_test, predictions, zero_division=0)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
