# Objective
## Build a system that can predict if a Thyroid Cancer survivor can relapse(his or her cancer reoccurs)
### Dataset
#### This dataset contains data about thyroid checkups for people with a diagnosis and is a comprehensive collection of patient information, specifically focused on individuals diagnosed with cancer

## Step-1: Common virtual environment was created and activated: myenv
#### pip install virtualenv
#### virtualenv myenv
#### .\myenv\Scripts\activate.ps1

## Installing required libraries

In [1]:
# %pip install -r requirements.txt

## Step-2: Importing required libraries

In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import zipfile
import warnings
warnings.filterwarnings("ignore")

import sklearn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

## Step-3: Data extraction from zipfile

In [4]:
def extractingZipFile(zipFilePath, extractTo):
    """
    Extracts the contents of a zip file to a specified directory.
    
    Parameters:
    zipFilePath (str): The path to the zip file.
    extractTo (str): The directory to extract the contents to.
    """
    with zipfile.ZipFile(zipFilePath, 'r') as zip_ref:
        zip_ref.extractall(extractTo)
extractingZipFile('thyroid_cancer.zip', 'data')        

## Step-4: Importing data into a dataframe

In [13]:
def readingData(path):
    """
    Reads the data from a CSV file and returns it as a pandas DataFrame.
    Parameters:
    path (str): The path to the CSV file.
    Returns:
    pd.DataFrame: The data as a pandas DataFrame.
    """
    df = pd.read_csv(path)
    return df
df=readingData("data/thyroid_cancer/dataset.csv")
df.head()

Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
0,27,F,No,No,No,Euthyroid,Single nodular goiter-left,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Indeterminate,No
1,34,F,No,Yes,No,Euthyroid,Multinodular goiter,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
2,30,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
3,62,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
4,62,F,No,No,No,Euthyroid,Multinodular goiter,No,Micropapillary,Multi-Focal,Low,T1a,N0,M0,I,Excellent,No


## Step-4: EDA (Exploratory Data Analysis)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Age                   383 non-null    int64 
 1   Gender                383 non-null    object
 2   Smoking               383 non-null    object
 3   Hx Smoking            383 non-null    object
 4   Hx Radiothreapy       383 non-null    object
 5   Thyroid Function      383 non-null    object
 6   Physical Examination  383 non-null    object
 7   Adenopathy            383 non-null    object
 8   Pathology             383 non-null    object
 9   Focality              383 non-null    object
 10  Risk                  383 non-null    object
 11  T                     383 non-null    object
 12  N                     383 non-null    object
 13  M                     383 non-null    object
 14  Stage                 383 non-null    object
 15  Response              383 non-null    ob

In [15]:
df.describe()

Unnamed: 0,Age
count,383.0
mean,40.866841
std,15.134494
min,15.0
25%,29.0
50%,37.0
75%,51.0
max,82.0


## Step-4(a): Checking missing values

In [16]:
def checkMissingValues(df):
    """
    Checks for missing values in the DataFrame 
    Parameters:
    df (pd.DataFrame): The DataFrame to check for missing values.
    Returns:
    missing values
    """
    return df.isnull().sum()

missing_values = checkMissingValues(df)
missing_values

Age                     0
Gender                  0
Smoking                 0
Hx Smoking              0
Hx Radiothreapy         0
Thyroid Function        0
Physical Examination    0
Adenopathy              0
Pathology               0
Focality                0
Risk                    0
T                       0
N                       0
M                       0
Stage                   0
Response                0
Recurred                0
dtype: int64

#### No missing values were found

## Step-4(b): Removing duplicates

In [17]:
## function to check for duplicates and remove dupliates
def checkDuplicates(df):
    """
    Checks for duplicate rows in the DataFrame and removes them.
    
    Parameters:
    df (pd.DataFrame): The DataFrame to check for duplicates.
    
    Returns:
    pd.DataFrame: The DataFrame with duplicates removed.
    """
    duplicates = df.duplicated().sum()
    if duplicates > 0:
        df = df.drop_duplicates()
        print(f"Removed {duplicates} duplicate rows.")
    else:
        print("No duplicate rows found.")
    return df
df = checkDuplicates(df)
df.head()

Removed 19 duplicate rows.


Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
0,27,F,No,No,No,Euthyroid,Single nodular goiter-left,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Indeterminate,No
1,34,F,No,Yes,No,Euthyroid,Multinodular goiter,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
2,30,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
3,62,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
4,62,F,No,No,No,Euthyroid,Multinodular goiter,No,Micropapillary,Multi-Focal,Low,T1a,N0,M0,I,Excellent,No


In [18]:
## function to check categorical columns and replacing them with numerical values
def checkCategoricalColumns(df):
    """
    Checks for categorical columns in the DataFrame and replaces them with numerical values.
    
    Parameters:
    df (pd.DataFrame): The DataFrame to check for categorical columns.
    
    Returns:
    pd.DataFrame: The DataFrame with categorical columns replaced with numerical values.
    """
    categorical_columns = df.select_dtypes(include=['object']).columns
    print(f"Categorical columns: {categorical_columns}")

    for col in categorical_columns:
        print(f"col.unique(): {df[col].unique()}")
        print(f"col.value_counts(): {df[col].value_counts()}")
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
    return df
df = checkCategoricalColumns(df)
df.head()

Categorical columns: Index(['Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy',
       'Thyroid Function', 'Physical Examination', 'Adenopathy', 'Pathology',
       'Focality', 'Risk', 'T', 'N', 'M', 'Stage', 'Response', 'Recurred'],
      dtype='object')
col.unique(): ['F' 'M']
col.value_counts(): Gender
F    293
M     71
Name: count, dtype: int64
col.unique(): ['No' 'Yes']
col.value_counts(): Smoking
No     315
Yes     49
Name: count, dtype: int64
col.unique(): ['No' 'Yes']
col.value_counts(): Hx Smoking
No     336
Yes     28
Name: count, dtype: int64
col.unique(): ['No' 'Yes']
col.value_counts(): Hx Radiothreapy
No     357
Yes      7
Name: count, dtype: int64
col.unique(): ['Euthyroid' 'Clinical Hyperthyroidism' 'Clinical Hypothyroidism'
 'Subclinical Hyperthyroidism' 'Subclinical Hypothyroidism']
col.value_counts(): Thyroid Function
Euthyroid                      313
Clinical Hyperthyroidism        20
Subclinical Hypothyroidism      14
Clinical Hypothyroidism         12
Subclinic

Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
0,27,0,0,0,0,2,3,3,2,1,2,0,0,0,0,2,0
1,34,0,0,1,0,2,1,3,2,1,2,0,0,0,0,1,0
2,30,0,0,0,0,2,4,3,2,1,2,0,0,0,0,1,0
3,62,0,0,0,0,2,4,3,2,1,2,0,0,0,0,1,0
4,62,0,0,0,0,2,1,3,2,0,2,0,0,0,0,1,0


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 364 entries, 0 to 382
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   Age                   364 non-null    int64
 1   Gender                364 non-null    int64
 2   Smoking               364 non-null    int64
 3   Hx Smoking            364 non-null    int64
 4   Hx Radiothreapy       364 non-null    int64
 5   Thyroid Function      364 non-null    int64
 6   Physical Examination  364 non-null    int64
 7   Adenopathy            364 non-null    int64
 8   Pathology             364 non-null    int64
 9   Focality              364 non-null    int64
 10  Risk                  364 non-null    int64
 11  T                     364 non-null    int64
 12  N                     364 non-null    int64
 13  M                     364 non-null    int64
 14  Stage                 364 non-null    int64
 15  Response              364 non-null    int64
 16  Recurred     

In [20]:
df.corr()["Recurred"].sort_values(ascending=False)

Recurred                1.000000
Response                0.702864
N                       0.624538
T                       0.553318
Stage                   0.444569
M                       0.351161
Smoking                 0.325300
Gender                  0.317731
Age                     0.248926
Hx Radiothreapy         0.171796
Hx Smoking              0.128472
Thyroid Function        0.071077
Pathology               0.014230
Physical Examination   -0.122076
Adenopathy             -0.182313
Focality               -0.368581
Risk                   -0.727404
Name: Recurred, dtype: float64

## Step-5: model building

In [21]:
X=df.drop(columns=['Recurred'])
y=df['Recurred']
X.head()

Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response
0,27,0,0,0,0,2,3,3,2,1,2,0,0,0,0,2
1,34,0,0,1,0,2,1,3,2,1,2,0,0,0,0,1
2,30,0,0,0,0,2,4,3,2,1,2,0,0,0,0,1
3,62,0,0,0,0,2,4,3,2,1,2,0,0,0,0,1
4,62,0,0,0,0,2,1,3,2,0,2,0,0,0,0,1


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((291, 16), (73, 16), (291,), (73,))

In [None]:
## different models training using gridserachCV and evaluation
def train_and_evaluate_model(model, param_grid, X_train, y_train, X_test, y_test):
    """
    Trains and evaluates a machine learning model using GridSearchCV.
    
    Parameters:
    model (sklearn.base.BaseEstimator): The machine learning model to train.
    param_grid (dict): The parameter grid for GridSearchCV.
    X_train (pd.DataFrame): The training data.
    y_train (pd.Series): The training labels.
    X_test (pd.DataFrame): The testing data.
    y_test (pd.Series): The testing labels.
    """
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    return best_model
# Logistic Calssifier
logistic_model = LogisticRegressionCV(max_iter=1000)
logistic_param_grid = {
    'Cs': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}
logistic_best_model = train_and_evaluate_model(logistic_model, logistic_param_grid, X_train, y_train, X_test, y_test)
# Random Forest Classifier
rf_model = RandomForestClassifier()
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
rf_best_model = train_and_evaluate_model(rf_model, rf_param_grid, X_train, y_train, X_test, y_test)
# XGBoost Classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}
xgb_best_model = train_and_evaluate_model(xgb_model, xgb_param_grid, X_train, y_train, X_test, y_test)
# Save the best model
def save_model(model, model_name):
    """
    Saves the trained model to a file.
    
    Parameters:
    model (sklearn.base.BaseEstimator): The trained model to save.
    model_name (str): The name of the model file.
    """
    joblib.dump(model, model_name)

save_model(logistic_best_model, 'logistic_model.pkl')
save_model(rf_best_model, 'rf_model.pkl')
save_model(xgb_best_model, 'xgb_model.pkl')

## function to print the model accuracy
def print_model_accuracy(model, X_test, y_test):
    """
    Prints the accuracy of the model on the test data.
    
    Parameters:
    model (sklearn.base.BaseEstimator): The trained model to evaluate.
    X_test (pd.DataFrame): The testing data.
    y_test (pd.Series): The testing labels.
    """
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model accuracy: {accuracy}")




Best parameters: {'Cs': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy: 0.8904109589041096
              precision    recall  f1-score   support

           0       0.92      0.92      0.92        51
           1       0.82      0.82      0.82        22

    accuracy                           0.89        73
   macro avg       0.87      0.87      0.87        73
weighted avg       0.89      0.89      0.89        73

[[47  4]
 [ 4 18]]
Best parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}
Accuracy: 0.958904109589041
              precision    recall  f1-score   support

           0       0.96      0.98      0.97        51
           1       0.95      0.91      0.93        22

    accuracy                           0.96        73
   macro avg       0.96      0.94      0.95        73
weighted avg       0.96      0.96      0.96        73

[[50  1]
 [ 2 20]]
Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 

TypeError: print_model_accuracy() takes 3 positional arguments but 4 were given

In [27]:
print("LogisticRegressionCV_best_model: ")
print_model_accuracy(logistic_best_model, X_test, y_test)
print("RandomForestClassifier_best_model: ")
print_model_accuracy(rf_best_model, X_test, y_test)
print("XGBClassifier_best_model: ")
print_model_accuracy(xgb_best_model, X_test, y_test)

LogisticRegressionCV_best_model: 
Model accuracy: 0.8904109589041096
RandomForestClassifier_best_model: 
Model accuracy: 0.958904109589041
XGBClassifier_best_model: 
Model accuracy: 0.9452054794520548


In [None]:
# loading the best model and checking precision,recall,f1-score, accuracy
def load_model(model_name):
    """
    Loads a trained model from a file.
    
    Parameters:
    model_name (str): The name of the model file.
    
    Returns:
    sklearn.base.BaseEstimator: The loaded model.
    """
    return joblib.load(model_name)
logistic_model = load_model('logistic_model.pkl')
rf_model = load_model('rf_model.pkl')
xgb_model = load_model('xgb_model.pkl')


# Step-6:  RandomForestClassifier has maximum
Accuracy: 0.958904109589041
              precision    recall  f1-score   support

           0       0.96      0.98      0.97        51
           1       0.95      0.91      0.93        22

    accuracy                           0.96        73
   macro avg       0.96      0.94      0.95        73
weighted avg       0.96      0.96      0.96        73


<!-- Categorical columns: Index(['Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy',
       'Thyroid Function', 'Physical Examination', 'Adenopathy', 'Pathology',
       'Focality', 'Risk', 'T', 'N', 'M', 'Stage', 'Response', 'Recurred'],
      dtype='object')
col.unique(): ['F' 'M']
col.value_counts(): Gender
F    293
M     71
Name: count, dtype: int64
col.unique(): ['No' 'Yes']
col.value_counts(): Smoking
No     315
Yes     49
Name: count, dtype: int64
col.unique(): ['No' 'Yes']
col.value_counts(): Hx Smoking
No     336
Yes     28
Name: count, dtype: int64
col.unique(): ['No' 'Yes']
col.value_counts(): Hx Radiothreapy
No     357
Yes      7
Name: count, dtype: int64
col.unique(): ['Euthyroid' 'Clinical Hyperthyroidism' 'Clinical Hypothyroidism'
 'Subclinical Hyperthyroidism' 'Subclinical Hypothyroidism']
col.value_counts(): Thyroid Function
Euthyroid                      313
Clinical Hyperthyroidism        20
Subclinical Hypothyroidism      14
Clinical Hypothyroidism         12
Subclinical Hyperthyroidism      5
Name: count, dtype: int64
col.unique(): ['Single nodular goiter-left' 'Multinodular goiter'
 'Single nodular goiter-right' 'Normal' 'Diffuse goiter']
col.value_counts(): Physical Examination
Multinodular goiter            135
Single nodular goiter-right    127
Single nodular goiter-left      88
Normal                           7
Diffuse goiter                   7
Name: count, dtype: int64
col.unique(): ['No' 'Right' 'Extensive' 'Left' 'Bilateral' 'Posterior']
col.value_counts(): Adenopathy
No           258
Right         48
Bilateral     32
Left          17
Extensive      7
Posterior      2
Name: count, dtype: int64
col.unique(): ['Micropapillary' 'Papillary' 'Follicular' 'Hurthel cell']
col.value_counts(): Pathology
Papillary         271
Micropapillary     45
Follicular         28
Hurthel cell       20
Name: count, dtype: int64
col.unique(): ['Uni-Focal' 'Multi-Focal']
col.value_counts(): Focality
Uni-Focal      228
Multi-Focal    136
Name: count, dtype: int64
col.unique(): ['Low' 'Intermediate' 'High']
col.value_counts(): Risk
Low             230
Intermediate    102
High             32
Name: count, dtype: int64
col.unique(): ['T1a' 'T1b' 'T2' 'T3a' 'T3b' 'T4a' 'T4b']
col.value_counts(): T
T2     138
T3a     96
T1a     46
T1b     40
T4a     20
T3b     16
T4b      8
Name: count, dtype: int64
col.unique(): ['N0' 'N1b' 'N1a']
col.value_counts(): N
N0     249
N1b     93
N1a     22
Name: count, dtype: int64
col.unique(): ['M0' 'M1']
col.value_counts(): M
M0    346
M1     18
Name: count, dtype: int64
col.unique(): ['I' 'II' 'IVB' 'III' 'IVA']
col.value_counts(): Stage
I      314
II      32
IVB     11
III      4
IVA      3
Name: count, dtype: int64
col.unique(): ['Indeterminate' 'Excellent' 'Structural Incomplete'
 'Biochemical Incomplete']
col.value_counts(): Response
Excellent                 189
Structural Incomplete      91
Indeterminate              61
Biochemical Incomplete     23
Name: count, dtype: int64
col.unique(): ['No' 'Yes']
col.value_counts(): Recurred
No     256
Yes    108
Name: count, dtype: int64 -->

In [None]:
# Categorical columns: Index(['Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy',
#        'Thyroid Function', 'Physical Examination', 'Adenopathy', 'Pathology',
#        'Focality', 'Risk', 'T', 'N', 'M', 'Stage', 'Response', 'Recurred'],
#       dtype='object')
# col.unique(): ['F' 'M']
# col.value_counts(): Gender
# F    293
# M     71
# Name: count, dtype: int64
# col.unique(): ['No' 'Yes']
# col.value_counts(): Smoking
# No     315
# Yes     49
# Name: count, dtype: int64
# col.unique(): ['No' 'Yes']
# col.value_counts(): Hx Smoking
# No     336
# Yes     28
# Name: count, dtype: int64
# col.unique(): ['No' 'Yes']
# col.value_counts(): Hx Radiothreapy
# No     357
# Yes      7
# Name: count, dtype: int64
# col.unique(): ['Euthyroid' 'Clinical Hyperthyroidism' 'Clinical Hypothyroidism'
#  'Subclinical Hyperthyroidism' 'Subclinical Hypothyroidism']
# col.value_counts(): Thyroid Function
# Euthyroid                      313
# Clinical Hyperthyroidism        20
# Subclinical Hypothyroidism      14
# Clinical Hypothyroidism         12
# Subclinical Hyperthyroidism      5
# Name: count, dtype: int64
# col.unique(): ['Single nodular goiter-left' 'Multinodular goiter'
#  'Single nodular goiter-right' 'Normal' 'Diffuse goiter']
# col.value_counts(): Physical Examination
# Multinodular goiter            135
# Single nodular goiter-right    127
# Single nodular goiter-left      88
# Normal                           7
# Diffuse goiter                   7
# Name: count, dtype: int64
# col.unique(): ['No' 'Right' 'Extensive' 'Left' 'Bilateral' 'Posterior']
# col.value_counts(): Adenopathy
# No           258
# Right         48
# Bilateral     32
# Left          17
# Extensive      7
# Posterior      2
# Name: count, dtype: int64
# col.unique(): ['Micropapillary' 'Papillary' 'Follicular' 'Hurthel cell']
# col.value_counts(): Pathology
# Papillary         271
# Micropapillary     45
# Follicular         28
# Hurthel cell       20
# Name: count, dtype: int64
# col.unique(): ['Uni-Focal' 'Multi-Focal']
# col.value_counts(): Focality
# Uni-Focal      228
# Multi-Focal    136
# Name: count, dtype: int64
# col.unique(): ['Low' 'Intermediate' 'High']
# col.value_counts(): Risk
# Low             230
# Intermediate    102
# High             32
# Name: count, dtype: int64
# col.unique(): ['T1a' 'T1b' 'T2' 'T3a' 'T3b' 'T4a' 'T4b']
# col.value_counts(): T
# T2     138
# T3a     96
# T1a     46
# T1b     40
# T4a     20
# T3b     16
# T4b      8
# Name: count, dtype: int64
# col.unique(): ['N0' 'N1b' 'N1a']
# col.value_counts(): N
# N0     249
# N1b     93
# N1a     22
# Name: count, dtype: int64
# col.unique(): ['M0' 'M1']
# col.value_counts(): M
# M0    346
# M1     18
# Name: count, dtype: int64
# col.unique(): ['I' 'II' 'IVB' 'III' 'IVA']
# col.value_counts(): Stage
# I      314
# II      32
# IVB     11
# III      4
# IVA      3
# Name: count, dtype: int64
# col.unique(): ['Indeterminate' 'Excellent' 'Structural Incomplete'
#  'Biochemical Incomplete']
# col.value_counts(): Response
# Excellent                 189
# Structural Incomplete      91
# Indeterminate              61
# Biochemical Incomplete     23
# Name: count, dtype: int64
# col.unique(): ['No' 'Yes']
# col.value_counts(): Recurred
# No     256
# Yes    108
# Name: count, dtype: int64