In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import os

def clone_repo(repo_url):
    """Clone a git repository into the current directory unless it already exists.

    The local directory name is the last path component of ``repo_url`` with
    a trailing ``.git`` suffix removed.

    Args:
        repo_url: URL (or path) of the repository to clone.

    Returns:
        The name of the local directory that holds the repository.

    Raises:
        ValueError: If no directory name can be derived from ``repo_url``.
        RuntimeError: If ``git`` cannot be started or ``git clone`` fails.
    """
    # Local import keeps this function self-contained.
    import subprocess

    # Ignore trailing slashes so ".../repo/" and ".../repo" name the same directory.
    repo_name = repo_url.rstrip('/').split('/')[-1]
    # Strip only a trailing ".git"; str.replace would also mangle names such
    # as "my.github.io" that contain ".git" in the middle.
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]
    if not repo_name:
        raise ValueError(f"Cannot derive a repository name from {repo_url!r}")

    if not os.path.exists(repo_name):
        print(f"Cloning repository from {repo_url}...")
        try:
            # An argument list (no shell) means the URL is never interpreted
            # by a shell; "--" stops git from parsing the URL as an option.
            # The target directory is passed explicitly so it always matches
            # the name checked above.
            subprocess.run(["git", "clone", "--", repo_url, repo_name], check=True)
        except (OSError, subprocess.CalledProcessError) as err:
            raise RuntimeError(f"Failed to clone {repo_url}") from err
    else:
        print(f"Repository {repo_name} already cloned.")
    return repo_name

# Fetch the Penguins classification repository
repo_url = "https://github.com/ine-rmotr-projects/classifying-penguins-with-machine-learning.git"
repo_name = clone_repo(repo_url)

# Read penguins.csv from the checkout; stop with a clear error if it is absent
data_file = os.path.join(repo_name, "penguins.csv")
if os.path.exists(data_file):
    data = pd.read_csv(data_file)
else:
    raise FileNotFoundError(f"Data file not found: {data_file}")

# Preview the dataset
print("Preview of the dataset:")
print(data.head())

# Basic information
print("\nDataset information:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
data.info()

# Check for missing values (count of null cells per column)
print("\nMissing values:")
print(data.isnull().sum())

# Keep only complete rows: any row containing a missing value is discarded
data = data.dropna(axis=0, how='any')

# Replace the species names with integer class labels; the fitted encoder is
# kept so the original names stay available via label_encoder.classes_
label_encoder = LabelEncoder()
data['species'] = label_encoder.fit_transform(data['species'])

# One-hot encode 'sex', dropping the first level so no redundant column is kept
data = pd.get_dummies(data, columns=['sex'], drop_first=True)

# Target is the encoded species; features are every other column except
# 'island', which is left out of the model
y = data['species']
X = data.drop(columns=['species', 'island'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Train a Random Forest Classifier (fixed seed for reproducible results)
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# LabelEncoder maps the classes to 0..n_classes-1. Passing that full label
# list keeps the report rows aligned with target_names and the confusion
# matrix at a fixed n_classes x n_classes shape even if a class is absent from
# the test split; without it, classification_report raises ValueError when the
# labels it finds do not match the length of target_names.
class_labels = list(range(len(label_encoder.classes_)))

# Evaluate the model
print("\nClassification Report:")
print(
    classification_report(
        y_test,
        y_pred,
        labels=class_labels,
        target_names=label_encoder.classes_,
    )
)

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=class_labels))

# Label the forest's per-feature importances with the feature column names,
# then draw them as a bar chart ordered from most to least important
feature_importances = pd.Series(rf_model.feature_importances_, index=X.columns)
(
    feature_importances
    .sort_values(ascending=False)
    .plot.bar(figsize=(10, 6), title="Feature Importances")
)
plt.show()

# Optional: write the preprocessed frame into the repository directory
# as cleaned_penguins.csv, without the index column
output_file = os.path.join(repo_name, "cleaned_penguins.csv")
data.to_csv(path_or_buf=output_file, index=False)
print(f"Cleaned data saved to {output_file}")




Cloning repository from https://github.com/ine-rmotr-projects/RDP-health-and-obesity-trends.git...


FileNotFoundError: Data file not found: RDP-health-and-obesity-trends/data/obesity-cleaned.csv