In [1]:
import pandas as pd

# Load the TSV file
df = pd.read_csv(r'C:\Users\Lenovo\Downloads\name.basics.tsv', sep='\t')

# View the first few rows of the dataset
print(df.head())

# Display the columns and structure of the dataset
print(df.info())

      nconst      primaryName birthYear deathYear  \
0  nm0000001     Fred Astaire      1899      1987   
1  nm0000002    Lauren Bacall      1924      2014   
2  nm0000003  Brigitte Bardot      1934        \N   
3  nm0000004     John Belushi      1949      1982   
4  nm0000005   Ingmar Bergman      1918      2007   

                    primaryProfession                           knownForTitles  
0        actor,miscellaneous,producer  tt0050419,tt0072308,tt0053137,tt0027125  
1  actress,soundtrack,archive_footage  tt0037382,tt0075213,tt0117057,tt0038355  
2   actress,music_department,producer  tt0057345,tt0049189,tt0056404,tt0054452  
3       actor,writer,music_department  tt0072562,tt0077975,tt0080455,tt0078723  
4               writer,director,actor  tt0050986,tt0083922,tt0050976,tt0069467  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13886887 entries, 0 to 13886886
Data columns (total 6 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   nconst        

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Convert birthYear and deathYear to numeric (handling missing values)
df['birthYear'] = pd.to_numeric(df['birthYear'], errors='coerce')
df['deathYear'] = pd.to_numeric(df['deathYear'].replace('\\N', None), errors='coerce')

In [5]:
# Create a new column 'isAlive' where 1 means alive and 0 means deceased
df['isAlive'] = df['deathYear'].isna().astype(int)

# Handling missing values by filling NaNs in 'birthYear' with the median year
df['birthYear'].fillna(df['birthYear'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['birthYear'].fillna(df['birthYear'].median(), inplace=True)


In [7]:
# Count the frequency of each profession
profession_counts = df['primaryProfession'].value_counts()

# Get the top 10 most common professions
top_professions = profession_counts.nlargest(10).index

# Create a new column 'primaryProfession_filtered' where only top professions are kept, others are set to 'Other'
df['primaryProfession_filtered'] = df['primaryProfession'].apply(lambda x: x if x in top_professions else 'Other')

# One-hot encode the filtered profession column
df = pd.get_dummies(df, columns=['primaryProfession_filtered'], drop_first=True)

In [9]:
# Define features (X) and target (y)
X = df[['birthYear'] + [col for col in df.columns if 'primaryProfession_filtered_' in col]]  # Using birthYear and filtered profession
y = df['isAlive']  # Target is whether the person is alive

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9929189808997311
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.64      0.75     70784
           1       0.99      1.00      1.00   4095283

    accuracy                           0.99   4166067
   macro avg       0.95      0.82      0.88   4166067
weighted avg       0.99      0.99      0.99   4166067



In [63]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt

In [65]:
import pandas as pd

# Load the TSV file
df = pd.read_csv(r'C:\Users\Lenovo\Downloads\name.basics.tsv', sep='\t')

# View the first few rows of the dataset
print(df.head())

# Display the columns and structure of the dataset
print(df.info())

      nconst      primaryName birthYear deathYear  \
0  nm0000001     Fred Astaire      1899      1987   
1  nm0000002    Lauren Bacall      1924      2014   
2  nm0000003  Brigitte Bardot      1934        \N   
3  nm0000004     John Belushi      1949      1982   
4  nm0000005   Ingmar Bergman      1918      2007   

                    primaryProfession                           knownForTitles  
0        actor,miscellaneous,producer  tt0050419,tt0072308,tt0053137,tt0027125  
1  actress,soundtrack,archive_footage  tt0037382,tt0075213,tt0117057,tt0038355  
2   actress,music_department,producer  tt0057345,tt0049189,tt0056404,tt0054452  
3       actor,writer,music_department  tt0072562,tt0077975,tt0080455,tt0078723  
4               writer,director,actor  tt0050986,tt0083922,tt0050976,tt0069467  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13886887 entries, 0 to 13886886
Data columns (total 6 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   nconst        

In [66]:
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)

In [67]:
# Simulate distributed learning by splitting the data into multiple nodes
# Split the dataset into 3 parts to simulate 3 different nodes (workers)
X_node1, X_temp, y_node1, y_temp = train_test_split(X, y, test_size=0.67, random_state=42)
X_node2, X_node3, y_node2, y_node3 = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [71]:
# Train separate models on each node using RandomForestClassifier
model_node1 = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_node1, y_node1)
model_node2 = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_node2, y_node2)

In [73]:
# Simulate a backdoor attack on node 3 by flipping the labels for some entries
y_node3_poisoned = y_node3.copy()

In [75]:
# Introduce the backdoor by flipping the labels of a portion of the data (simulating a malicious attack)
flip_condition = X_node3[:, 0] > 0  # You can adjust this condition based on your dataset
y_node3_poisoned[flip_condition] = 1 - y_node3_poisoned[flip_condition]

In [77]:
# Train the poisoned model on node 3
model_node3_poisoned = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_node3, y_node3_poisoned)

In [57]:
# Simulate federated learning by averaging the feature importances (Federated Averaging)
# This is a simplified version of Federated Averaging, Random Forest models can't be averaged like linear models,
# so we average the feature importances instead to simulate a global decision.
avg_importances = (model_node1.feature_importances_ + model_node2.feature_importances_ + model_node3_poisoned.feature_importances_) / 3

In [61]:
# Aggregate the models by using the most important features
# For simplicity, we can use one of the models (e.g., model_node1) as the "global model" and adjust its feature importances
global_model = model_node1
global_model.feature_importances_ = avg_importances

AttributeError: property 'feature_importances_' of 'RandomForestClassifier' object has no setter