In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


In [3]:
# Read the labelled examples from 'data.csv' and, in the same expression, recode
# the 'tail' answers as integers ('no' -> 0, 'yes' -> 1). Series.map leaves any
# value that is not a key of the dict as NaN.
train_data = pd.read_csv('data.csv').assign(
    tail=lambda frame: frame['tail'].map({'no': 0, 'yes': 1})
)

In [4]:
# Display the loaded frame; 'tail' has already been recoded to 0/1 by the cell above.
train_data


Unnamed: 0,message,fingers,tail,species
0,pluvia arbor aquos,4,0,Aquari
1,cosmix xeno nebuz odbitaz,5,1,Zorblax
2,solarix glixx novum galaxum quasar,5,1,Zorblax
3,arbor insectus pesros ekos dootix nimbus,2,1,Florian
4,mermax drakos lorix epikoz deftax,4,0,Faerix
...,...,...,...,...
495,empathix sadix disgux dredax pridius afgstix e...,2,0,Emotivor
496,quasar ustron nebulax meteorn,4,0,Quixnar
497,astron xeno ceaestar astron kometa,6,1,Zorblax
498,sporzom nimbus terram terranix aviana ekos nimbub,2,1,Florian


In [5]:
# Learn the species -> integer mapping, then overwrite the 'species' column with
# the integer codes. The fitted encoder is kept so predictions can be decoded later.
# NOTE: the text labels are replaced in place, so re-running this cell without
# reloading the CSV would refit the encoder on the integer codes.
label_encoder = LabelEncoder().fit(train_data['species'])
train_data['species'] = label_encoder.transform(train_data['species'])

In [6]:
# Display the frame again: 'species' now holds the label encoder's integer codes.
train_data

Unnamed: 0,message,fingers,tail,species
0,pluvia arbor aquos,4,0,0
1,cosmix xeno nebuz odbitaz,5,1,9
2,solarix glixx novum galaxum quasar,5,1,9
3,arbor insectus pesros ekos dootix nimbus,2,1,4
4,mermax drakos lorix epikoz deftax,4,0,3
...,...,...,...,...
495,empathix sadix disgux dredax pridius afgstix e...,2,0,2
496,quasar ustron nebulax meteorn,4,0,7
497,astron xeno ceaestar astron kometa,6,1,9
498,sporzom nimbus terram terranix aviana ekos nimbub,2,1,4


In [7]:
# Turn the free-text 'message' column into TF-IDF features.
# The vocabulary is capped at 5000 terms; it is learned from every row of
# train_data and then applied to those same rows.
vectorizer = TfidfVectorizer(max_features=5000)
vectorizer.fit(train_data['message'])
X_message = vectorizer.transform(train_data['message'])

In [11]:
# Assemble the design matrix: the dense TF-IDF columns followed by 'fingers' and 'tail'.
# Both pieces carry train_data's own index, so concat(axis=1) pairs every message
# with its own numeric features. Resetting the index on only one side would make
# concat align on mismatched labels and pad with NaN whenever train_data's index
# is not the default 0..n-1.
X_train = pd.concat(
    [
        pd.DataFrame(X_message.toarray(), index=train_data.index),
        train_data[['fingers', 'tail']],
    ],
    axis=1,
)

In [13]:
# The TF-IDF columns are labelled with integers while 'fingers' and 'tail' are
# strings; convert every label to str so the column index has a single type.
X_train.columns = X_train.columns.map(str)

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '628', '629', '630', '631', '632', '633', '634', '635', 'fingers',
       'tail'],
      dtype='object', length=638)

In [16]:
# Target vector: the integer-encoded 'species' column of train_data.
y_train = train_data.loc[:, 'species']


In [17]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the TF-IDF vectorizer was fitted on all rows before this split,
# so the held-out rows influenced the vocabulary/IDF weights - confirm this is acceptable.
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Fit a logistic-regression classifier on the 80% split
# (up to 1000 solver iterations, fixed seed).
log_clf = LogisticRegression(random_state=42, max_iter=1000)
log_clf.fit(X=X_train_split, y=y_train_split)

In [19]:
# Report accuracy on the held-out 20% split. These rows were not passed to fit(),
# so this is a validation score rather than a training score and is labelled as such.
print("Logistic Regression Validation Accuracy:", log_clf.score(X_test_split, y_test_split))


Logistic Regression Training Accuracy: 0.86


# **Logistic Regression Model with TF-IDF and Additional Features**

### ***Step 1: Import Required Libraries***

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

### ***Step 2: Load the Training Data***

In [None]:
# Read the labelled training set from /kaggle/input/dataset and, in the same
# expression, recode the 'tail' answers as integers ('no' -> 0, 'yes' -> 1).
# Series.map leaves any value that is not a key of the dict as NaN.
train_data = pd.read_csv('/kaggle/input/dataset/data.csv').assign(
    tail=lambda frame: frame['tail'].map({'no': 0, 'yes': 1})
)


### ***Step 3: Encode the Target Variable***

In [None]:
# Learn the species -> integer mapping, then overwrite the 'species' column with
# the integer codes. The fitted encoder is kept so predictions can be decoded later.
# NOTE: the text labels are replaced in place, so re-running this cell without
# reloading the CSV would refit the encoder on the integer codes.
label_encoder = LabelEncoder().fit(train_data['species'])
train_data['species'] = label_encoder.transform(train_data['species'])


### ***Step 4: Preprocess Text Using TF-IDF***

In [None]:
# Turn the free-text 'message' column into TF-IDF features.
# The vocabulary is capped at 5000 terms; it is learned from every row of
# train_data and then applied to those same rows.
vectorizer = TfidfVectorizer(max_features=5000)
vectorizer.fit(train_data['message'])
X_message = vectorizer.transform(train_data['message'])


### ***Step 5: Combine TF-IDF with Numerical Features***

In [None]:
# Assemble the design matrix: the dense TF-IDF columns followed by 'fingers' and 'tail'.
# Both pieces carry train_data's own index, so concat(axis=1) pairs every message
# with its own numeric features. Resetting the index on only one side would make
# concat align on mismatched labels and pad with NaN whenever train_data's index
# is not the default 0..n-1.
X_train = pd.concat(
    [
        pd.DataFrame(X_message.toarray(), index=train_data.index),
        train_data[['fingers', 'tail']],
    ],
    axis=1,
)

# The TF-IDF columns are labelled with integers while 'fingers' and 'tail' are
# strings; convert every label to str so the column index has a single type.
X_train.columns = X_train.columns.astype(str)


### ***Step 6: Define Target Variable and Split Data***

In [None]:
# Target vector: the integer-encoded 'species' column of train_data.
y_train = train_data.loc[:, 'species']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the TF-IDF vectorizer was fitted on all rows before this split,
# so the held-out rows influenced the vocabulary/IDF weights - confirm this is acceptable.
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)


### ***Step 7: Train the Logistic Regression Model***

In [None]:
# Fit a logistic-regression classifier on the 80% split
# (up to 1000 solver iterations, fixed seed).
log_clf = LogisticRegression(max_iter=1000, random_state=42)
log_clf.fit(X_train_split, y_train_split)

# Report accuracy on the held-out 20% split. These rows were not passed to fit(),
# so this is a validation score rather than a training score and is labelled as such.
print("Logistic Regression Validation Accuracy:", log_clf.score(X_test_split, y_test_split))


### ***Step 8: Load and Process Test Data***

In [None]:
# Read the unlabelled test set and recode 'tail' exactly as for the training
# data ('no' -> 0, 'yes' -> 1; any other value becomes NaN).
test_data = pd.read_csv('/kaggle/input/dataset/test.csv').assign(
    tail=lambda frame: frame['tail'].map({'no': 0, 'yes': 1})
)

# Reuse the already-fitted vectorizer (transform only, no refit) so the test
# messages are projected onto the vocabulary learned from the training messages.
X_test_message = vectorizer.transform(test_data['message'])


### ***Step 9: Combine Test Data Features***

In [None]:
# Assemble the test design matrix in the same column order as the training one:
# the dense TF-IDF columns followed by 'fingers' and 'tail'.
# Both pieces carry test_data's own index, so concat(axis=1) pairs every message
# with its own numeric features. Resetting the index on only one side would make
# concat align on mismatched labels and pad with NaN whenever test_data's index
# is not the default 0..n-1.
X_test = pd.concat(
    [
        pd.DataFrame(X_test_message.toarray(), index=test_data.index),
        test_data[['fingers', 'tail']],
    ],
    axis=1,
)

# Convert every column label to str so the column index has a single type.
X_test.columns = X_test.columns.astype(str)


### ***Step 10: Make Predictions and Save the Results***

In [None]:
# Destination of the output CSV.
output_file_path = '/kaggle/working/predictions.csv'

# Predict an encoded class for every test row, then translate the integer codes
# back to species names with the encoder fitted on the training labels.
predictions = log_clf.predict(X=X_test)
test_data['species'] = label_encoder.inverse_transform(predictions)

# Write the test frame, now including the predicted 'species' column, without
# the row index, and report where it went.
test_data.to_csv(path_or_buf=output_file_path, index=False)
print(f"Predictions saved to {output_file_path}")


In [21]:
# Read the unlabelled test set from 'test.csv' and recode 'tail' exactly as for
# the training data ('no' -> 0, 'yes' -> 1).
test_data = pd.read_csv('test.csv')
test_data['tail'] = test_data['tail'].map({'no': 0, 'yes': 1})

# Reuse the already-fitted vectorizer (transform only, no refit) so the test
# messages are projected onto the vocabulary learned from the training messages.
X_test_message = vectorizer.transform(test_data['message'])

# Assemble the test design matrix in the same column order as the training one:
# the dense TF-IDF columns followed by 'fingers' and 'tail'.
# Both pieces carry test_data's own index, so concat(axis=1) pairs every message
# with its own numeric features. Resetting the index on only one side would make
# concat align on mismatched labels and pad with NaN whenever test_data's index
# is not the default 0..n-1.
X_test = pd.concat(
    [
        pd.DataFrame(X_test_message.toarray(), index=test_data.index),
        test_data[['fingers', 'tail']],
    ],
    axis=1,
)

# Convert every column label to str so the column index has a single type.
X_test.columns = X_test.columns.astype(str)


In [22]:
# Run the fitted classifier over the test design matrix; the result holds one
# encoded class per row of X_test.
predictions = log_clf.predict(X=X_test)


In [23]:
# Translate the encoded predictions back to species names with the encoder
# fitted on the training labels, and store them as a 'species' column on test_data.
test_data['species'] = label_encoder.inverse_transform(y=predictions)


In [25]:

# Write the test frame, now including the predicted 'species' column, to a CSV
# in the working directory without the row index, then report where it went.
output_file_path = 'predictions.csv'
test_data.to_csv(path_or_buf=output_file_path, index=False)

print(f"Predictions saved to {output_file_path}")

Predictions saved to predictions.csv
