In [13]:
pip install nltk spacy gensim scikit-learn

Collecting spacy
  Downloading spacy-3.7.5-cp312-cp312-win_amd64.whl.metadata (27 kB)
Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-win_amd64.whl.metadata (8.2 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.10-cp312-cp312-win_amd64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.8-cp312-cp312-win_amd64.whl.metadata (8.6 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp312-cp312-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy)
  Downloading thinc-8.2.5-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Col

In [1]:
import pandas as pd
import nltk
import spacy
import gensim
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
import re

# Fetch the NLTK data used by the preprocessing helpers. 'punkt_tab' is
# requested in addition to 'punkt' because recent NLTK releases load it for
# word_tokenize; nltk.download skips packages that are already up to date.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Installing the spaCy model goes through pip and asks for a kernel restart,
# so only do it when the model package is not already installed.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

# Load dataset
def load_dataset(file_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of the
            CSV file to load.

    Returns:
        The DataFrame produced by ``pd.read_csv(file_path)``.
    """
    # Honour the caller's path; previously a fixed file name was always read
    # and the argument was ignored.
    return pd.read_csv(file_path)

# Clean text
def clean_text(text):
    """Normalize a string for tokenization.

    Every non-word character is replaced by a space, the result is
    lower-cased, and each run of whitespace is collapsed to a single space.
    Leading/trailing spaces are not stripped.
    """
    without_symbols = re.sub(r'\W', ' ', text)
    lowered = without_symbols.lower()
    return re.sub(r'\s+', ' ', lowered)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\avira\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\avira\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\avira\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [1]:
# Tokenization
def tokenize(text):
    """Split ``text`` into tokens by delegating to NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Stop-word Removal
# Lazily-built cache of NLTK's English stop words, shared by all calls so the
# set is not rebuilt for every row when the function is used with .apply().
_ENGLISH_STOP_WORDS = None


def remove_stopwords(tokens):
    """Return the tokens that are not in NLTK's English stop-word list.

    Matching is an exact string comparison, so it is case-sensitive; the
    order of the remaining tokens is preserved.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A new list containing only the tokens that are not stop words.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Built on first use (not at definition time) so defining the function
        # does not require the corpus to be present yet.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return [word for word in tokens if word not in _ENGLISH_STOP_WORDS]

# Stemming
def stem_tokens(tokens):
    """Run every token through a ``PorterStemmer`` and return the stems as a list."""
    porter = PorterStemmer()
    return list(map(porter.stem, tokens))

# Lemmatization
def lemmatize_tokens(tokens):
    """Lemmatize each token with a ``WordNetLemmatizer`` and return the results as a list."""
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in tokens:
        lemmas.append(wordnet.lemmatize(token))
    return lemmas

In [2]:
from gensim.models import KeyedVectors

# Load pre-trained Word2Vec model (example: Google News)
def load_pretrained_word2vec_model(path='GoogleNews-vectors-negative300.bin', binary=True):
    """Load pre-trained word vectors stored in the word2vec format.

    Args:
        path: Location of the vector file. Defaults to the file name used
            previously, so calls without arguments behave as before.
        binary: Passed through to ``load_word2vec_format`` as ``binary``.

    Returns:
        The object returned by ``KeyedVectors.load_word2vec_format``.
    """
    return KeyedVectors.load_word2vec_format(path, binary=binary)

# Get the vector for a word
def get_word2vec_vector(word, model):
    """Look up ``word`` in ``model``.

    Returns ``model[word]`` when ``word in model`` is true, otherwise ``None``.
    """
    if word not in model:
        return None
    return model[word]

In [3]:
def train_word2vec_model(sentences, vector_size=100, window=5, min_count=1, workers=4):
    """Train a gensim ``Word2Vec`` model on tokenized sentences.

    The keyword arguments default to the values that were previously
    hard-coded, so ``train_word2vec_model(sentences)`` behaves as before.

    Args:
        sentences: Training corpus, passed to ``Word2Vec`` unchanged.
        vector_size: Forwarded to ``Word2Vec`` as ``vector_size``.
        window: Forwarded to ``Word2Vec`` as ``window``.
        min_count: Forwarded to ``Word2Vec`` as ``min_count``.
        workers: Forwarded to ``Word2Vec`` as ``workers``.

    Returns:
        The ``Word2Vec`` instance constructed from the corpus.
    """
    return Word2Vec(
        sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
    )

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

def train_model(df, text_column, label_column):
    """Preprocess a text column and fit a TF-IDF + Multinomial Naive Bayes classifier.

    The text is cleaned, tokenized, stripped of stop words and lemmatized. The
    intermediate results are written onto ``df`` itself as the columns
    ``cleaned_text``, ``tokens``, ``filtered_tokens``, ``lemmatized_tokens``
    and ``lemmatized_text``. The rows are then split 80/20
    (``random_state=42``), the pipeline is fitted on the training part and its
    accuracy on the held-out part is printed.

    Args:
        df: DataFrame containing the text and label columns (modified in place).
        text_column: Name of the column holding the raw text.
        label_column: Name of the column holding the class labels.

    Returns:
        The fitted scikit-learn pipeline, or ``None`` if any step failed; in
        that case the error message is printed instead of being raised.
    """
    try:
        # Fail early with a readable message instead of a bare KeyError text.
        missing = [col for col in (text_column, label_column) if col not in df.columns]
        if missing:
            raise ValueError(f"Column(s) not found in DataFrame: {missing}")

        # Clean text. Missing values become empty strings and non-string values
        # are stringified, so clean_text always receives a str and a single
        # NaN no longer aborts the whole run.
        df['cleaned_text'] = df[text_column].fillna('').astype(str).apply(clean_text)
        print("Text cleaned successfully.")

        # Tokenization
        df['tokens'] = df['cleaned_text'].apply(tokenize)
        print("Tokenization complete.")

        # Stop-word Removal
        df['filtered_tokens'] = df['tokens'].apply(remove_stopwords)
        print("Stop-word removal complete.")

        # Lemmatization
        df['lemmatized_tokens'] = df['filtered_tokens'].apply(lemmatize_tokens)
        print("Lemmatization complete.")

        # Join lemmatized tokens for TF-IDF, which expects one string per document
        df['lemmatized_text'] = df['lemmatized_tokens'].apply(lambda tokens: ' '.join(tokens))
        print("Lemmatized tokens joined.")

        # Check if there are any rows in the DataFrame
        if df.empty:
            raise ValueError("DataFrame is empty after preprocessing.")

        # Print some sample data for verification
        print("Sample preprocessed data:")
        print(df[['lemmatized_text', label_column]].head())

        # Create a pipeline with TF-IDF and Naive Bayes
        model = make_pipeline(TfidfVectorizer(), MultinomialNB())
        print("Pipeline created.")

        # Split data (fixed random_state keeps the split reproducible)
        X_train, X_test, y_train, y_test = train_test_split(df['lemmatized_text'], df[label_column], test_size=0.2, random_state=42)
        print("Train-test split complete.")

        # Train the model
        model.fit(X_train, y_train)
        print("Model training complete.")

        # Predict and evaluate
        predictions = model.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        print("Model Accuracy:", accuracy)

        return model

    except Exception as e:
        # Best-effort contract: report the failure and signal it to the caller
        # with an explicit None instead of propagating the exception.
        print(f"An error occurred: {e}")
        return None