# **Logistic Regression Model with TF-IDF and Additional Features**

### ***Step 1: Import Required Libraries***

In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

### S***tep 2: Load the Training Data***

In [4]:
# Load the training data from the CSV file
train_data = pd.read_csv('/kaggle/input/dataset/data.csv')

# Convert the 'tail' column to binary (no -> 0, yes -> 1)
train_data['tail'] = train_data['tail'].map({'no': 0, 'yes': 1})


### ***Step 3: Encode the Target Variable***

In [5]:
# Encode the categorical 'species' column to numeric values
label_encoder = LabelEncoder()
train_data['species'] = label_encoder.fit_transform(train_data['species'])


### ***Step 4: Preprocess Text Using TF-IDF***

In [6]:
# Vectorize the 'message' column using TF-IDF
# We limit the number of features to the top 5000 words
vectorizer = TfidfVectorizer(max_features=5000)
X_message = vectorizer.fit_transform(train_data['message'])


### ***Step 5: Combine TF-IDF with Numerical Features***

In [7]:
# Combine the TF-IDF features with the 'fingers' and 'tail' numerical columns
X_train = pd.concat([pd.DataFrame(X_message.toarray(), index=train_data.index), 
                     train_data[['fingers', 'tail']].reset_index(drop=True)], axis=1)

# Ensure all column names are strings
X_train.columns = X_train.columns.astype(str)


### ***Step 6: Define Target Variable and Split Data***

In [8]:
# Define the target variable (species)
y_train = train_data['species']

# Split the dataset into training and test sets (80% train, 20% test)
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


### ***Step 7: Train the Logistic Regression Model***

In [9]:
# Initialize and train a Logistic Regression model
log_clf = LogisticRegression(max_iter=1000, random_state=42)
log_clf.fit(X_train_split, y_train_split)

# Check the training accuracy on the test split
print("Logistic Regression Training Accuracy:", log_clf.score(X_test_split, y_test_split))


Logistic Regression Training Accuracy: 0.86


### ***Step 8: Load and Process Test Data***

In [10]:
# Load the test data
test_data = pd.read_csv('/kaggle/input/dataset/test.csv')

# Convert the 'tail' column to binary (no -> 0, yes -> 1)
test_data['tail'] = test_data['tail'].map({'no': 0, 'yes': 1})

# Preprocess the 'message' column using the same vectorizer
X_test_message = vectorizer.transform(test_data['message'])


### ***Step 9: Combine Test Data Features***

In [11]:
# Combine the TF-IDF features with the 'fingers' and 'tail' numerical columns
X_test = pd.concat([pd.DataFrame(X_test_message.toarray(), index=test_data.index), 
                    test_data[['fingers', 'tail']].reset_index(drop=True)], axis=1)

# Ensure all column names are strings
X_test.columns = X_test.columns.astype(str)


### ***Step 10: Make Predictions and Save the Results***

In [12]:
# Make predictions on the test data
predictions = log_clf.predict(X_test)

# Add the predictions to the test dataframe
test_data['species'] = label_encoder.inverse_transform(predictions)

# Save the results to a CSV file
output_file_path = '/kaggle/working/predictions.csv'
test_data.to_csv(output_file_path, index=False)

# Print the file path to the output
print(f"Predictions saved to {output_file_path}")


OSError: Cannot save file into a non-existent directory: '\kaggle\working'