# In [1]:  (Jupyter notebook cell prompt)
# Scikit-Learn: Machine Learning Cheat Sheet

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error

# 1. Data Preparation
# Toy frame with five rows: a numeric column containing one missing value,
# a two-level categorical column, and a binary label column.
data = pd.DataFrame(dict(
    Feature1=[1, 2, np.nan, 4, 5],
    Feature2=list('ABABA'),
    Target=[0, 1, 0, 1, 1],
))

# Handling Missing Values
# NOTE: the imputer and the encoder below are fit on the full frame, i.e.
# before the train/test split. That keeps the cheat sheet short, but in a real
# project fit them on the training split only (e.g. inside a Pipeline) so no
# information from the test rows leaks into preprocessing.
imputer = SimpleImputer(strategy='mean')  # Replace NaN with the column mean
# fit_transform returns a 2-D (n_samples, 1) array; flatten it before storing
# it back into a single column.
data['Feature1'] = imputer.fit_transform(data[['Feature1']]).ravel()

# Encoding Categorical Variables
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4, so the old keyword raises TypeError on current releases.
# drop='first' drops the first category ('A'), leaving one indicator column.
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_features = encoder.fit_transform(data[['Feature2']])
data = data.drop(columns=['Feature2'])
# Take the output column names from the fitted encoder (here: 'Feature2_B')
# instead of hard-coding them, so they stay correct if the categories change.
data[list(encoder.get_feature_names_out())] = encoded_features

# Splitting Data
# Separate the label column from the predictors, then hold out 20% of the
# rows as a test split (seeded so the split is repeatable).
y = data['Target']
X = data.drop(columns=['Target'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# 2. Feature Scaling
# The scaler is fitted on the training split only; the same fitted scaler is
# then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

# 3. Linear Regression (For Continuous Targets)
# Fit on the unscaled training features and report the mean squared error
# of the predictions on the held-out rows.
regressor = LinearRegression().fit(X_train, y_train)
y_pred = regressor.predict(X_test)
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred))

# 4. Logistic Regression (For Classification)
# Trained and evaluated on the standardized features; accuracy is measured
# on the held-out rows.
classifier = LogisticRegression().fit(X_train_scaled, y_train)
y_pred = classifier.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

# 5. Decision Trees & Random Forest
# Both estimators use randomness internally (feature permutation at each
# split, bootstrap sampling for the forest). Seeding them makes reruns
# produce the same fitted models, consistent with the seeded train/test split.
tree = DecisionTreeClassifier(max_depth=3, random_state=42)
tree.fit(X_train, y_train)
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest.fit(X_train, y_train)

# 6. Support Vector Machine (SVM)
# RBF-kernel classifier with regularization strength C=1.0, trained on the
# standardized features.
svm = SVC(C=1.0, kernel='rbf').fit(X_train_scaled, y_train)

# 7. Gradient Boosting
# 100 boosting stages with a shrinkage (learning rate) of 0.1. random_state
# seeds the per-stage trees so repeated runs give the same fitted model.
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
gb.fit(X_train, y_train)

# 8. Model Evaluation
y_pred_rf = forest.predict(X_test)
# The held-out split is tiny, so it may contain only one class. Passing the
# classes the forest was trained on keeps the confusion matrix and report at
# one row per class instead of collapsing to whatever labels happen to appear.
known_labels = forest.classes_
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf, labels=known_labels))
# zero_division=0 reports 0 (without a warning) for metrics that are undefined
# because a class has no true or predicted samples in the test split.
print(
    "Classification Report:\n",
    classification_report(y_test, y_pred_rf, labels=known_labels, zero_division=0),
)

# 9. Cross-Validation
# For a classifier, an integer `cv` means stratified k-fold. That raises
# ValueError when the fold count exceeds the size of every class, which is
# what cv=5 does on this toy data (classes have only 2 and 3 rows). Cap the
# fold count at the smallest class size (k-fold needs at least 2 folds).
n_splits = max(2, min(5, int(y.value_counts().min())))
scores = cross_val_score(RandomForestClassifier(random_state=42), X, y, cv=n_splits)
print("Cross-validation scores:", scores)

# 10. Clustering (K-Means)
# Seed the synthetic data as well as the estimator; otherwise the printed
# cluster centers change on every run even though KMeans itself is seeded.
rng = np.random.default_rng(42)
X_cluster = rng.random((100, 2))  # 100 points drawn uniformly from [0, 1)^2
# n_init is set explicitly because its default changed across scikit-learn
# releases (10 -> 'auto'); pinning it keeps results comparable across versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(X_cluster)
labels = kmeans.labels_  # cluster index assigned to each row of X_cluster
print("Cluster Centers:\n", kmeans.cluster_centers_)


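# Output of the cell above when scikit-learn is not installed:
#   ModuleNotFoundError: No module named 'sklearn'
# Install it first, e.g. `pip install scikit-learn` (the import name is `sklearn`).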