    KNN

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

# Step 1: Generate synthetic dataset (one tuple per candidate, transposed into columns)
columns = ['Experience', 'Written_Score', 'Interview_Score', 'Salary']
rows = [
    (5, 8, 10, 60000),
    (8, 7, 6, 80000),
    (3, 6, 7, 45000),
    (10, 9, 8, 90000),
    (2, 5, 9, 35000),
    (7, 8, 5, 75000),
]
data = {name: list(values) for name, values in zip(columns, zip(*rows))}

# Step 2: Save dataset to a .csv file
csv_path = 'candidates_dataset.csv'
pd.DataFrame(data).to_csv(csv_path, index=False)

# Step 3: Load dataset back from disk; `df` is the frame used from here on
df = pd.read_csv(csv_path)

# Step 4: Separate the target column from the feature columns
y = df['Salary']
X = df.drop(columns='Salary')

# Step 5: Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 6: Standardize features (optional)
# The scaler is fitted on the training rows only and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 7 + 8: Build the KNN model (K = 3) and train it; fit() returns the estimator
knn_model = KNeighborsRegressor(n_neighbors=3).fit(X_train_scaled, y_train)

# Step 9: Make predictions on the testing set
# NOTE: y_pred is not compared against y_test anywhere in this cell.
y_pred = knn_model.predict(X_test_scaled)

# Step 10: Use the trained model to predict salaries for new candidates
# One inner list per candidate, in the same column order the scaler was fitted on.
new_candidates = pd.DataFrame(
    [[5, 8, 10], [8, 7, 6]],
    columns=['Experience', 'Written_Score', 'Interview_Score'],
)

# Standardize the new candidate data with the already-fitted scaler
new_candidates_scaled = scaler.transform(new_candidates)

# Predict salaries for new candidates and report them one per line
predicted_salaries = knn_model.predict(new_candidates_scaled)
print("Predicted salaries for new candidates:")
for i, salary in enumerate(predicted_salaries):
    print(f"Candidate {i+1}: ${salary:.2f}")


Predicted salaries for new candidates:
Candidate 1: $56666.67
Candidate 2: $70000.00


        Decision tree model on the Iris dataset

In [5]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the IRIS dataset and unpack its feature matrix and label vector
iris = load_iris()
X, y = iris.data, iris.target

# Define function to evaluate model performance
def evaluate_model(X_train, X_test, y_train, y_test, criterion='entropy', random_state=42, **tree_params):
    """Train a decision tree on one split and score it on the other.

    Args:
        X_train, y_train: features and labels used to fit the tree.
        X_test, y_test: features and labels used for scoring.
        criterion: split-quality criterion passed to DecisionTreeClassifier;
            defaults to 'entropy', the value this function previously hard-coded.
        random_state: seed passed to DecisionTreeClassifier so that repeated
            runs build the same tree. Pass None to leave the tree unseeded.
        **tree_params: any further DecisionTreeClassifier keyword arguments
            (for example max_depth or min_samples_split).

    Returns:
        Tuple (accuracy, precision, recall, f1). Precision, recall and F1 are
        computed with average='weighted'.
    """
    clf = DecisionTreeClassifier(criterion=criterion, random_state=random_state, **tree_params)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # The three per-class metrics share the same averaging mode.
    precision, recall, f1 = (
        metric(y_test, y_pred, average='weighted')
        for metric in (precision_score, recall_score, f1_score)
    )
    return accuracy, precision, recall, f1

# Vary percentage of training data
percentages = [0.6, 0.7, 0.8, 0.9]
for percentage in percentages:
    # Same seed on every pass, so only the training fraction changes.
    split = train_test_split(X, y, train_size=percentage, random_state=42)
    # These four names remain bound to the last split once the loop finishes.
    X_train, X_test, y_train, y_test = split
    accuracy, precision, recall, f1 = evaluate_model(*split)
    print(f"Percentage of training data: {percentage}")
    # end="\n\n" emits the blank separator line after each report.
    print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}", end="\n\n")

# Explore effect of other decision tree parameters
# Build the evaluation split explicitly rather than relying on whatever
# X_train / X_test an earlier loop left behind. train_size=0.9 with
# random_state=42 is the split the training-fraction loop ends on, so this
# cell now gives the same split even when it is run on its own.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=42)

parameters = {'max_depth': [None, 3, 5, 10], 'min_samples_split': [2, 5, 10]}
for max_depth in parameters['max_depth']:
    for min_samples_split in parameters['min_samples_split']:
        clf = DecisionTreeClassifier(
            criterion='entropy',
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            # Fixed seed: scikit-learn breaks ties between equally good splits
            # at random, so an unseeded tree can differ between runs.
            random_state=42,
        )
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        print(f"Max Depth: {max_depth}, Min Samples Split: {min_samples_split}")
        print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")
        print()


Percentage of training data: 0.6
Accuracy: 0.98, Precision: 0.98, Recall: 0.98, F1 Score: 0.98

Percentage of training data: 0.7
Accuracy: 1.00, Precision: 1.00, Recall: 1.00, F1 Score: 1.00

Percentage of training data: 0.8
Accuracy: 1.00, Precision: 1.00, Recall: 1.00, F1 Score: 1.00

Percentage of training data: 0.9
Accuracy: 1.00, Precision: 1.00, Recall: 1.00, F1 Score: 1.00

Max Depth: None, Min Samples Split: 2
Accuracy: 1.00, Precision: 1.00, Recall: 1.00, F1 Score: 1.00

Max Depth: None, Min Samples Split: 5
Accuracy: 1.00, Precision: 1.00, Recall: 1.00, F1 Score: 1.00

Max Depth: None, Min Samples Split: 10
Accuracy: 1.00, Precision: 1.00, Recall: 1.00, F1 Score: 1.00

Max Depth: 3, Min Samples Split: 2
Accuracy: 1.00, Precision: 1.00, Recall: 1.00, F1 Score: 1.00

Max Depth: 3, Min Samples Split: 5
Accuracy: 1.00, Precision: 1.00, Recall: 1.00, F1 Score: 1.00

Max Depth: 3, Min Samples Split: 10
Accuracy: 1.00, Precision: 1.00, Recall: 1.00, F1 Score: 1.00

Max Depth: 5, Min

    Everything together: comparing KNN, Naive Bayes and Decision Tree classifiers

In [1]:
#data set generator
import pandas as pd
import numpy as np

# Generate random data: 100 rows of five uniform [0, 1) features
np.random.seed(42)
feature_names = ['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5']
feature_values = np.random.rand(100, len(feature_names))
data = pd.DataFrame(feature_values, columns=feature_names)

# Generate random target class (0 or 1), one label per row.
# Drawn after the features so the seeded random stream is consumed in the same order.
n_rows = data.shape[0]
data['TARGET CLASS'] = np.random.randint(0, 2, size=n_rows)

# Save data to CSV file (without the index column)
data.to_csv('classified_data.csv', index=False)


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the Classified Data
classified_data = pd.read_csv('classified_data.csv')

# Prepare data: label column on one side, every other column as features
y = classified_data['TARGET CLASS']
X = classified_data.drop(columns='TARGET CLASS')

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Define functions to evaluate models
def evaluate_knn(n_neighbors):
    """Fit a k-nearest-neighbours classifier and score it on the held-out split.

    Reads the module-level X_train, y_train, X_test and y_test.

    Args:
        n_neighbors: value passed to KNeighborsClassifier(n_neighbors=...).

    Returns:
        Tuple (accuracy, precision, recall, f1), each computed by the matching
        sklearn.metrics function with its default arguments.
    """
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_test, predictions) for scorer in scorers)

def evaluate_nb():
    """Fit a Gaussian Naive Bayes classifier and score it on the held-out split.

    Takes no parameters; reads the module-level X_train, y_train, X_test and
    y_test.

    Returns:
        Tuple (accuracy, precision, recall, f1).
    """
    # fit() returns the estimator itself, so training and prediction can be chained.
    predicted = GaussianNB().fit(X_train, y_train).predict(X_test)
    return (
        accuracy_score(y_test, predicted),
        precision_score(y_test, predicted),
        recall_score(y_test, predicted),
        f1_score(y_test, predicted),
    )

def evaluate_decision_tree(max_depth, min_samples_split, random_state=42):
    """Fit a decision tree classifier and score it on the held-out split.

    Reads the module-level X_train, y_train, X_test and y_test.

    Args:
        max_depth: passed to DecisionTreeClassifier(max_depth=...); None leaves
            the depth unrestricted.
        min_samples_split: passed to
            DecisionTreeClassifier(min_samples_split=...).
        random_state: seed passed to DecisionTreeClassifier. It defaults to 42,
            the seed used for the data generation and the train/test split, so
            the saved comparison results are repeatable. Pass None to leave
            the tree unseeded.

    Returns:
        Tuple (accuracy, precision, recall, f1).
    """
    clf = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=random_state,
    )
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    return accuracy, precision, recall, f1

# Vary parameters for each model
knn_parameters = list(range(1, 10, 2))  # odd neighbour counts 1, 3, 5, 7, 9
nb_parameters = []  # no parameters are varied for Naive Bayes
# Only the depth limit changes between tree configurations; min_samples_split stays at 2.
decision_tree_parameters = [
    {'max_depth': depth, 'min_samples_split': 2}
    for depth in (None, 5, 10)
]

# Evaluate models and save results to CSV
# Each row is [model name, parameter description, accuracy, precision, recall, f1].
results = [
    ['KNN', n_neighbors, *evaluate_knn(n_neighbors)]
    for n_neighbors in knn_parameters
]

results.append(['Naive Bayes', 'N/A', *evaluate_nb()])

for params in decision_tree_parameters:
    description = f"Max Depth: {params['max_depth']}, Min Samples Split: {params['min_samples_split']}"
    scores = evaluate_decision_tree(params['max_depth'], params['min_samples_split'])
    results.append(['Decision Tree', description, *scores])

# Create DataFrame and save to CSV
result_columns = ['Model', 'Parameters', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
df = pd.DataFrame(results, columns=result_columns)
df.to_csv('model_comparison_results.csv', index=False)
