In [1]:
import pandas as pd

# Load the data without specifying the 'engine' parameter
data = pd.read_csv(r'C:\Users\Blake\Desktop\preprocessed.csv')

In [2]:
data

Unnamed: 0,Movie Name,Actor Name,Character,Quote
0,murderland,ken kushner,teddy,choose doorway start magical journey murderland
1,murderland,ken kushner,teddy,think make past scrap
2,murderland,ken kushner,teddy,let see make round two
3,murderland,ken kushner,teddy,eat console
4,murderland,ken kushner,teddy,mother
...,...,...,...,...
666262,zulu dawn,burt lancaster,col. durnford,melvill ride obstacle colour aloft
666263,zulu dawn,burt lancaster,col. durnford,god sake hold back get horse
666264,zulu dawn,burt lancaster,col. durnford,alright alright
666265,zulu dawn,burt lancaster,col. durnford,hold colour aloft mockingly zulu wearing purlo...


## SVC attempt with subsample of two actors

In [4]:
# Drop rows with missing 'Quote' values
data = data.dropna(subset=['Quote'])

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Subsectioning two actors for testing
actors_of_interest = ["burt lancaster", "ken kushner"]
data_filtered = data[data['Actor Name'].isin(actors_of_interest)]

X = data_filtered['Quote']
y = data_filtered['Actor Name']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply TF-IDF vectorization to the 'Quote' column
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Create and train a linear SVM model
model = SVC(kernel='linear')
model.fit(X_train_tfidf, y_train)

# Make predictions and calculate accuracy
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Generate the classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)



Accuracy: 0.7557840616966581
Classification Report:
                 precision    recall  f1-score   support

burt lancaster       0.75      0.93      0.83       249
   ken kushner       0.78      0.44      0.57       140

      accuracy                           0.76       389
     macro avg       0.77      0.69      0.70       389
  weighted avg       0.76      0.76      0.74       389



In [None]:
# Predict the 'Actor Name' column
X = data['Quote']
y = data['Actor Name']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply TF-IDF vectorization to the 'Quote' column
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Create and train a linear SVM model
model = SVC(kernel='linear')
model.fit(X_train_tfidf, y_train)

# Make predictions and generate the classification report
y_pred = model.predict(X_test_tfidf)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

## Random Forest attempt

In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Subsample a fraction of your data
subsample_fraction = 0.1
data_subsampled = data.sample(frac=subsample_fraction, random_state=42)


In [17]:
# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(data_subsampled['Quote'])


In [18]:
# X Train
X_train, X_val, y_train, y_val = train_test_split(X_tfidf, data_subsampled['Actor Name'], test_size=0.2, random_state=42)

# Instantiate and train the Random Forest classifier
random_forest = RandomForestClassifier(n_estimators=40, random_state=42, n_jobs=-1)
random_forest.fit(X_train, y_train)


In [21]:
y_pred = random_forest.predict(X_val)

# Checking results
accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.11511331232177698


In [23]:
# Count actors
actor_counts = data["Actor Name"].value_counts()

print(actor_counts)


Actor Name
robert de niro       7302
gene hackman         6508
nicolas cage         5131
jack nicholson       4817
denzel washington    4590
                     ... 
maurice schutz        152
corey feldman         133
victor jory           105
julie delpy            97
chad everett           93
Name: count, Length: 668, dtype: int64


In [None]:
# SVC Attempt using subsamples

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Subsample data
subsample_fraction = 0.1
data_subsampled = data.sample(frac=subsample_fraction, random_state=42)

# Split the dataset into train and validation sets
X = data_subsampled['Quote']
y = data_subsampled['Actor Name']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the validation data using the same TF-IDF vectorizer
X_val_tfidf = tfidf_vectorizer.transform(X_val)

# Instantiate and train the Support Vector Machine classifier
svm_classifier = SVC(kernel='linear', class_weight='balanced', random_state=42)
svm_classifier.fit(X_train_tfidf, y_train)

# Make predictions on the validation set
y_pred = svm_classifier.predict(X_val_tfidf)

# Evaluate the model using classification report
report = classification_report(y_val, y_pred)
print("Classification Report:\n", report)


Classification Report:
                         precision    recall  f1-score   support

            adam arkin       0.00      0.00      0.00        11
            adam brody       0.07      0.09      0.08        11
          adam sandler       0.00      0.00      0.00        10
          adrien brody       0.20      0.13      0.16        15
          akiko kazami       0.13      0.33      0.19        12
             al pacino       0.29      0.09      0.13        46
            alan arkin       0.00      0.00      0.00        14
          alan cumming       0.07      0.10      0.08        21
          alan marshal       0.03      0.05      0.04        20
         albert brooks       0.20      0.02      0.04        51
         albert finney       0.04      0.04      0.04        25
         alec guinness       0.15      0.13      0.14        15
         alex j. gould       0.03      0.06      0.04        17
         alfre woodard       0.00      0.00      0.00        19
           alfr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
