In [27]:
import pandas as pd

def impute_missing_values(nums):
    """Replace missing entries in ``nums`` with the mean of the present values.

    Parameters
    ----------
    nums : list
        Numbers, with ``None`` (or NaN) marking the missing entries.

    Returns
    -------
    list
        The values in their original order, with each missing entry replaced
        by the mean of the non-missing ones, rounded to two decimals.
    """
    s = pd.Series(nums)

    # Series.mean() skips NaN by default, so only observed values contribute.
    fill_value = round(s.mean(), 2)

    # fillna() returns a new Series; no in-place mutation is needed.
    return s.fillna(fill_value).tolist()

# Example: the two None entries are filled with the mean of 3.5, 4.2 and 6.0,
# rounded to two decimals.
nums = [3.5, None, 4.2, 6.0, None]
print(impute_missing_values(nums))


[3.5, 4.57, 4.2, 6.0, 4.57]


In [28]:
import pandas as pd

def filter_scores(data, threshold=50):
    """Keep only the records whose ``'score'`` is at least ``threshold``.

    Parameters
    ----------
    data : list of dict
        Records that each carry a ``'score'`` key.
    threshold : int or float, default 50
        Minimum score (inclusive) a record needs in order to be kept.

    Returns
    -------
    list of dict
        The matching records in their original order; ``[]`` when ``data``
        is empty.
    """
    df = pd.DataFrame(data)

    # An empty input produces a frame without a 'score' column, so the
    # column lookup below would raise KeyError; there is nothing to filter.
    if df.empty:
        return []

    keep = df['score'] >= threshold
    return df[keep].to_dict(orient='records')

# Example: with a threshold of 50 the record scoring 45 is dropped and the
# records scoring 85 and 90 are kept.
data = [{"id": 1, "score": 85}, {"id": 2, "score": 45}, {"id": 3, "score": 90}]
print(filter_scores(data, 50))


[{'id': 1, 'score': 85}, {'id': 3, 'score': 90}]


In [29]:
import pandas as pd

def aggregate_sales(data):
    """Total the ``'amount'`` of sales records per ``'region'``.

    Parameters
    ----------
    data : list of dict
        Records that each carry a ``'region'`` and an ``'amount'`` key.

    Returns
    -------
    dict
        Maps each region to the sum of its amounts; ``{}`` when ``data`` is
        empty.
    """
    df = pd.DataFrame(data)

    # An empty input produces a frame without a 'region' column, so groupby
    # would raise KeyError; there is nothing to aggregate.
    if df.empty:
        return {}

    totals = df.groupby('region')['amount'].sum()
    return totals.to_dict()

# Example: two sales per region, so each region's total is the sum of two amounts.
sales_data = [
    {"region": "North", "amount": 10000},
    {"region": "South", "amount": 5000},
    {"region": "North", "amount": 15000},
    {"region": "South", "amount": 13000}
]

print(aggregate_sales(sales_data))


{'North': 25000, 'South': 18000}


In [30]:
import pandas as pd

def sort_students_by_grade(data, ascending=False):
    """Format student records as ``"name: grade"`` strings ordered by grade.

    Parameters
    ----------
    data : list of dict
        Records that each carry a ``'name'`` and a ``'grade'`` key.
    ascending : bool, default False
        ``False`` lists the highest grade first, ``True`` the lowest first.

    Returns
    -------
    list of str
        One ``"name: grade"`` string per record, in sorted order; ``[]`` when
        ``data`` is empty.
    """
    df = pd.DataFrame(data)

    # An empty input produces a frame without a 'grade' column, so
    # sort_values would raise KeyError; there is nothing to sort.
    if df.empty:
        return []

    df_sorted = df.sort_values(by='grade', ascending=ascending)

    # Walk the two columns side by side rather than using iterrows(), which
    # builds a Series per row and can upcast values while doing so.
    return [
        f"{name}: {grade}"
        for name, grade in zip(df_sorted['name'], df_sorted['grade'])
    ]

# Example: three students given in no particular grade order.
students = [
    {"name": "Alice", "grade": 80},
    {"name": "Bob", "grade": 95},
    {"name": "Charlie", "grade": 70}
]

# ascending=False puts the highest grade first (this is also the default).
print(sort_students_by_grade(students, ascending=False))


['Bob: 95', 'Alice: 80', 'Charlie: 70']


In [31]:
import pandas as pd

def merge_datasets(customers, orders):
    """Join customer records with their orders on ``'cust_id'``.

    Parameters
    ----------
    customers : list of dict
        Records that each carry a ``'cust_id'`` key.
    orders : list of dict
        Records that each carry a ``'cust_id'`` key.

    Returns
    -------
    list of dict
        One combined record per matching (customer, order) pair.  ``pd.merge``
        defaults to an inner join, so customers without orders and orders
        without a customer are left out.  ``[]`` when either input is empty.
    """
    df_customers = pd.DataFrame(customers)
    df_orders = pd.DataFrame(orders)

    # An empty side produces a frame without a 'cust_id' column and pd.merge
    # would raise KeyError; an inner join with an empty side has no rows anyway.
    if df_customers.empty or df_orders.empty:
        return []

    merged_df = pd.merge(df_customers, df_orders, on='cust_id')
    return merged_df.to_dict(orient='records')

# Example: two customers, each with exactly one order; the orders are listed
# in the opposite cust_id order from the customers.
customers = [
    {"cust_id": 1, "name": "Alice"},
    {"cust_id": 2, "name": "Bob"}
]

orders = [
    {"order_id": 101, "cust_id": 2, "total": 300},
    {"order_id": 102, "cust_id": 1, "total": 150}
]

print(merge_datasets(customers, orders))


[{'cust_id': 1, 'name': 'Alice', 'order_id': 102, 'total': 150}, {'cust_id': 2, 'name': 'Bob', 'order_id': 101, 'total': 300}]


In [32]:
import pandas as pd
import numpy as np


def one_hot_encode(categories):
    """One-hot encode a sequence of category labels.

    Parameters
    ----------
    categories : list
        The label of each item.

    Returns
    -------
    list of list of int
        One row per input item holding a single 1 in the column of its label
        and 0 elsewhere.  Columns follow the order chosen by
        ``pd.get_dummies`` (blue, green, red for the example in this cell).
    """
    indicator_frame = pd.get_dummies(pd.Series(categories), dtype=int)
    return indicator_frame.to_numpy().tolist()

# Example: four items drawn from three distinct colours, so each encoded row
# has three columns.
categories = ["red", "blue", "red", "green"]
print(one_hot_encode(categories))




import pandas as pd

def label_encode(categories):
    """Map each category label to an integer code.

    Parameters
    ----------
    categories : list
        The label of each item.

    Returns
    -------
    list of int
        One code per input item as produced by ``pd.factorize``; equal labels
        share a code.
    """
    # factorize returns (codes, uniques); only the codes are needed here.
    codes, _ = pd.factorize(pd.Series(categories))
    return codes.tolist()

# Example: "red" appears twice, so both occurrences receive the same code.
categories = ["red", "blue", "red", "green"]
print(label_encode(categories))



[[0, 0, 1], [1, 0, 0], [0, 0, 1], [0, 1, 0]]
[0, 1, 0, 2]


In [33]:
import pandas as pd

def min_max_scale(values):
    """Rescale numbers linearly so the minimum maps to 0 and the maximum to 1.

    Parameters
    ----------
    values : list
        Numbers to rescale.

    Returns
    -------
    list of float
        The rescaled values, rounded to two decimals, in input order.  When
        all values are equal every element is 0.0; ``[]`` when ``values`` is
        empty.
    """
    s = pd.Series(values, dtype="float64")
    if s.empty:
        return []

    lowest = s.min()
    span = s.max() - lowest

    if span == 0:
        # Constant input: dividing by the zero range would turn every
        # element into 0/0 = NaN, so keep the zero offsets instead.
        scaled = s - lowest
    else:
        scaled = (s - lowest) / span

    return scaled.round(2).tolist()

# Example: the minimum is 5 and the maximum is 20, so 5 maps to 0.0 and 20 to 1.0.
values = [5, 15, 20, 10]
print(min_max_scale(values))


[0.0, 0.67, 1.0, 0.33]


In [34]:
import pandas as pd

def bin_ages(ages):
    """Assign each age to one of the groups "<18", "18-35", "36-50" or "51+".

    Parameters
    ----------
    ages : list
        Ages to classify.

    Returns
    -------
    list of str
        The group label of each age, in input order.
    """
    # Left-closed intervals: [-inf, 18), [18, 36), [36, 51), [51, inf).
    # Putting the edges at the first age of the next group (with right=False)
    # keeps fractional ages consistent with the labels: 17.5 is "<18", not
    # "18-35" as it would be with a right-closed edge at 17.
    bins = [float('-inf'), 18, 36, 51, float('inf')]
    labels = ["<18", "18-35", "36-50", "51+"]

    binned_ages = pd.cut(ages, bins=bins, labels=labels, right=False)

    # Convert the categorical result to plain strings
    return binned_ages.astype(str).tolist()

# Example: unsorted ages that cover all four groups, including 35 (the last
# age of "18-35") and 37 (just inside "36-50").
ages = [48, 22, 15, 37, 35, 52, 8, 19, 67, 34]
print(bin_ages(ages))


['36-50', '18-35', '<18', '36-50', '18-35', '51+', '<18', '18-35', '51+', '18-35']


In [43]:
import numpy as np

def detect_outliers_iqr(data, multiplier=1.5, verbose=True):
    """Return the values lying outside the interquartile-range fences.

    A value is an outlier when it is below ``Q1 - multiplier * IQR`` or above
    ``Q3 + multiplier * IQR``, where Q1/Q3 are the 25th/75th percentiles of
    ``data`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : list
        Numbers to inspect.
    multiplier : float, default 1.5
        How many IQRs beyond the quartiles the fences are placed.
    verbose : bool, default True
        When true, print Q1, Q3 and the IQR (one per line) before returning.

    Returns
    -------
    list
        The outlying elements of ``data`` in their original order; ``[]``
        when ``data`` is empty.
    """
    # Percentiles are undefined for an empty input, and it has no outliers.
    if len(data) == 0:
        return []

    q1, q3 = np.percentile(data, [25, 75])
    iqr = q3 - q1

    if verbose:
        print(q1)
        print(q3)
        print(iqr)

    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr

    return [x for x in data if x < lower_bound or x > upper_bound]

# Sample input: five small values and one value (100) far above the rest.
data = [2, 5, 4, 100, 6, 3]

# The call prints Q1, Q3 and the IQR itself; the outer print shows the
# returned list of outliers.
print(detect_outliers_iqr(data))




3.25
5.75
2.5
[100]


In [36]:
from sklearn.linear_model import LinearRegression
import numpy as np

def predict_new_x(X, y, new_x):
    """Fit a one-feature linear regression on (X, y) and predict at ``new_x``.

    Parameters
    ----------
    X : list
        Feature values, one per sample.
    y : list
        Target values, one per sample.
    new_x : number
        Feature value to predict the target for.

    Returns
    -------
    The model's prediction for ``new_x``, rounded to two decimals.
    """
    # scikit-learn expects a 2-D feature matrix: one column, one row per sample.
    features = np.array(X).reshape(-1, 1)
    targets = np.array(y)

    model = LinearRegression()
    model.fit(features, targets)

    # predict() likewise takes a 2-D array and returns one value per row.
    query = np.array([[new_x]])
    predicted = model.predict(query)[0]

    return round(predicted, 2)

# Example: five (x, y) training points, then a prediction one step beyond the
# largest training x.
X = [1, 2, 3, 4, 5]
y = [2, 4, 5, 4, 5]

new_x = 6
print(predict_new_x(X, y, new_x))


5.8


In [44]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Training data: five samples with two features each, plus one target per sample
X = np.array([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
y = np.array([9, 13, 17, 21, 25])

# Fit a linear regression on both features
model = LinearRegression()
model.fit(X, y)

# Predict the target for a single unseen two-feature sample
new_X = np.array([[6, 7]])
prediction = model.predict(new_X)

print(f"Predicted value for input {new_X[0]} is {prediction[0]}")


Predicted value for input [6 7] is 29.0


In [45]:
import pandas as pd
import numpy as np
 
# Toy passenger table used to demonstrate a manual shuffle-and-split.
data = pd.DataFrame({
    'PassengerId': np.arange(1, 11),
    'Pclass': [3, 1, 3, 1, 3, 3, 1, 3, 3, 2],
    'Sex': [0, 1, 1, 1, 0, 0, 0, 0, 1, 1],
    'Age': [22, 38, 26, 35, 35, 29, 54, 2, 27, 14],
    'Survived': [0, 1, 1, 1, 0, 0, 0, 1, 1, 1]
})

# Share of rows that goes to the training set; the remainder is the test set.
# The printed headings below are derived from this value so they cannot drift.
TRAIN_FRACTION = 0.3

# sample(frac=1) returns every row in shuffled order; the fixed random_state
# makes the shuffle repeatable.
shuffled_data = data.sample(frac=1, random_state=42).reset_index(drop=True)

train_size = int(TRAIN_FRACTION * len(shuffled_data))

train_set = shuffled_data.iloc[:train_size]
test_set = shuffled_data.iloc[train_size:]

print(f"🔷 Training Set ({TRAIN_FRACTION:.0%}):")
print(train_set)

print(f"\n🔶 Test Set ({1 - TRAIN_FRACTION:.0%}):")
print(test_set)



🔷 Training Set (30%):
   PassengerId  Pclass  Sex  Age  Survived
0            9       3    1   27         1
1            2       1    1   38         1
2            6       3    0   29         0

🔶 Test Set (70%):
   PassengerId  Pclass  Sex  Age  Survived
3            1       3    0   22         0
4            8       3    0    2         1
5            3       3    1   26         1
6           10       2    1   14         1
7            5       3    0   35         0
8            4       1    1   35         1
9            7       1    0   54         0


In [37]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Toy customer table: four numeric features plus the binary purchase label
data = {
    'age': [25, 30, 22, 35, 28, 42, 50, 27, 31, 29],
    'annual_income': [40000, 60000, 32000, 70000, 48000, 90000, 110000, 55000, 64000, 51000],
    'credit_score': [650, 720, 600, 710, 680, 750, 780, 690, 705, 665],
    'num_previous_purchases': [1, 4, 0, 5, 2, 8, 9, 3, 4, 2],
    'buy_product': [0, 1, 0, 1, 0, 1, 1, 0, 1, 0]
}

df = pd.DataFrame(data)

# Features (X) are the four customer attributes; the target (y) is the label
X = df[['age', 'annual_income', 'credit_score', 'num_previous_purchases']]
y = df['buy_product']

# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
# NOTE(review): the split is not stratified, and in the recorded run the test
# set contained no class-0 rows (support 0 in the report). Consider passing
# stratify=y so both classes are evaluated.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit a 100-tree random forest on the training rows
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)

# Score the held-out rows
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# zero_division=0 reports 0 (instead of warning) for metrics whose denominator is 0
print("Test Accuracy:", round(accuracy, 2))
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# A single unseen customer, built with the training column labels (X.columns)
# so the feature names line up with the ones the model was fitted on
new_customer = pd.DataFrame([[33, 75000, 730, 6]], columns=X.columns)

predicted_class = classifier.predict(new_customer)[0]
predicted_prob = classifier.predict_proba(new_customer)[0]

print(f"New customer prediction (0=No, 1=Yes): {predicted_class}")
print(f"Prediction Probability: [Not buy: {predicted_prob[0]:.2f}, Buy: {predicted_prob[1]:.2f}]")



Test Accuracy: 0.33

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.33      0.50         3

    accuracy                           0.33         3
   macro avg       0.50      0.17      0.25         3
weighted avg       1.00      0.33      0.50         3

New customer prediction (0=No, 1=Yes): 1
Prediction Probability: [Not buy: 0.29, Buy: 0.71]


In [38]:
# Import libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Toy student table with three study features and the pass/fail label
data = {
    'hours_studied': [2, 10, 4, 7, 8, 1, 9, 5, 3, 6],
    'attendance_rate': [50, 95, 70, 85, 90, 40, 92, 75, 65, 80],
    'homework_completion': [60, 100, 80, 90, 95, 50, 100, 85, 70, 88],
    'passed_exam': [0, 1, 0, 1, 1, 0, 1, 1, 0, 1]  # target: 1=passed, 0=failed
}

df = pd.DataFrame(data)

# Step 2: Separate the features (X) from the target (y)
X = df[['hours_studied', 'attendance_rate', 'homework_completion']]
y = df['passed_exam']

# Step 3: Hold out 30% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Step 4: Fit a decision tree on the training rows
classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Step 5: Score the held-out rows
y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", report)

# Step 6: Predict for one unseen student, built with the training column
# labels (X.columns) so the feature names line up with the fitted model
new_student = pd.DataFrame([[6, 80, 85]], columns=X.columns)

predicted_class = classifier.predict(new_student)[0]
predicted_prob = classifier.predict_proba(new_student)[0]

print(f"Prediction for new student (0=Fail, 1=Pass): {predicted_class}")
print(f"Prediction Probability: [Fail: {predicted_prob[0]:.2f}, Pass: {predicted_prob[1]:.2f}]")


Accuracy: 1.00

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         1

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3

Prediction for new student (0=Fail, 1=Pass): 1
Prediction Probability: [Fail: 0.00, Pass: 1.00]
