In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, StratifiedKFold, LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load the dataset
df = pd.read_csv("/content/data_stories_one_shot.csv")  # Make sure the path is correct



In [4]:
# Define a basic preprocessing function (without NLTK)
def basic_preprocess(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    stopwords = set([
        'the', 'is', 'in', 'and', 'to', 'of', 'a', 'with', 'that', 'this',
        'for', 'on', 'as', 'are', 'an', 'be', 'or', 'at', 'by', 'from', 'it', 'was', 'which'
    ])
    tokens = text.split()
    return ' '.join([word for word in tokens if word not in stopwords])

In [5]:
# Apply preprocessing
df['Processed'] = df['Sentence'].apply(basic_preprocess)

# Prepare inputs
X_text = df['Processed']
y = df['Stage']
groups = df['Plot_Name']

# TF-IDF vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X_text)

In [6]:
# Define models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(kernel='linear'),
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# Results storage
results = {
    'Model': [],
    'Zero-Shot CV Accuracy': [],
    'Zero-Shot Leave-One-Plot-Out Accuracy': []
}


In [7]:

# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    # CV score
    cv_scores = cross_val_score(model, X, y, cv=cv)

    # Leave-One-Plot-Out score
    logo = LeaveOneGroupOut()
    logo_scores = []
    for train_idx, test_idx in logo.split(X, y, groups):
        model.fit(X[train_idx], y[train_idx])
        preds = model.predict(X[test_idx])
        logo_scores.append(accuracy_score(y[test_idx], preds))

    # Save results
    results['Model'].append(name)
    results['Zero-Shot CV Accuracy'].append(np.mean(cv_scores))
    results['Zero-Shot Leave-One-Plot-Out Accuracy'].append(np.mean(logo_scores))

# Create final results table
results_df = pd.DataFrame(results)
print(results_df.round(3))




                 Model  Zero-Shot CV Accuracy  \
0  Logistic Regression                  0.715   
1                  SVM                  0.769   
2          Naive Bayes                  0.738   
3        Random Forest                  0.746   

   Zero-Shot Leave-One-Plot-Out Accuracy  
0                                  0.662  
1                                  0.702  
2                                  0.686  
3                                  0.618  
