In [75]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Read the superheroes CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell only
# runs where the file exists at exactly this location.
df = pd.read_csv('/Users/deadrienhill/Downloads/superheroes_nlp_dataset.csv')

# Features are the history text (cast to str); target is the strength score.
X, y = df['history_text'].astype(str), df['strength_score']

# Build the pipeline (token counts -> TF-IDF weighting -> linear regression)
# and fit it on every row of the dataset in a single expression.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('reg', LinearRegression()),
]).fit(X, y)

# Smoke-test the fitted pipeline on a single hand-written text snippet.
test_text = 'He was one of the many prisoners of Indian Hil...'
prediction = pipeline.predict([test_text])
print(prediction)

[32.76639105]


In [80]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Read the superheroes CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell only
# runs where the file exists at exactly this location.
df = pd.read_csv('/Users/deadrienhill/Downloads/superheroes_nlp_dataset.csv')

# Baseline reference value: the mean of the strength_score column taken
# over every row that was loaded.
baseline = df['strength_score'].mean()

# Define preprocessing functions
def basic_clean(text):
    """Normalise a raw string for downstream tokenisation.

    Steps, in order: lowercase the text, delete every character that is not
    a word character, whitespace, or an apostrophe, collapse each run of
    whitespace into a single space, and trim leading/trailing whitespace.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    lowered = text.lower()
    # Apostrophes are deliberately kept by the character class below.
    without_punct = re.sub(r'[^\w\s\']', '', lowered)
    return re.sub(r'\s+', ' ', without_punct).strip()

def tokenize(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``.

    Parameters
    ----------
    text : str
        The text to tokenise.

    Returns
    -------
    The token sequence produced by ``word_tokenize(text)``, unchanged.
    """
    return word_tokenize(text)

# Lazily-filled cache for the default stopword set, so the corpus is read
# once instead of once per call (this function is applied row by row).
_STOPWORD_CACHE = {}


def remove_stopwords(tokens, stopword_list=None):
    """Drop stopwords from a sequence of tokens, preserving order.

    Parameters
    ----------
    tokens : iterable of str
        The tokens to filter.
    stopword_list : iterable of str, optional
        Words to remove. When omitted (the default), the NLTK English
        stopword list ``stopwords.words('english')`` is used, exactly as
        before this parameter existed.

    Returns
    -------
    list of str
        The tokens that are not in the stopword collection.
    """
    if stopword_list is None:
        if 'english' not in _STOPWORD_CACHE:
            _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
        stopword_set = _STOPWORD_CACHE['english']
    else:
        stopword_set = frozenset(stopword_list)

    # Set membership keeps the per-token check constant-time; the original
    # list lookup scanned the whole stopword list for every token.
    return [word for word in tokens if word not in stopword_set]

def lemmatize(words):
    """Lemmatize every word in ``words`` with NLTK's ``WordNetLemmatizer``.

    A new lemmatizer is created on each call and its ``lemmatize`` method
    is applied to each word with the method's default arguments.

    Parameters
    ----------
    words : iterable of str
        The words to lemmatize.

    Returns
    -------
    list
        One lemmatized entry per input word, in the original order.
    """
    return list(map(WordNetLemmatizer().lemmatize, words))

# Preprocess history_text: clean -> tokenize -> drop stopwords -> lemmatize,
# then re-join the tokens so the vectorizer receives plain strings again.
df['processed_text'] = df['history_text'].astype(str).apply(basic_clean)
df['tokens'] = df['processed_text'].apply(tokenize)
df['tokens'] = df['tokens'].apply(remove_stopwords)
df['tokens'] = df['tokens'].apply(lemmatize)
df['processed_text'] = df['tokens'].apply(lambda x: ' '.join(x))

# Split the data into training, validation, and testing sets.
# The first split holds out 20% as the test set; the second holds out 20%
# of the remainder as the validation set.
train, test = train_test_split(df, test_size=0.2, random_state=123)
train, val = train_test_split(train, test_size=0.2, random_state=123)

# Vectorize the text using TF-IDF vectorization.
# The vocabulary and IDF weights are fitted on the training split only and
# then reused to transform the validation and test splits.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train['processed_text'])
X_val = vectorizer.transform(val['processed_text'])
X_test = vectorizer.transform(test['processed_text'])
y_train = train['strength_score']
y_val = val['strength_score']
y_test = test['strength_score']

# Train a linear regression model on the training set
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

# `baseline` is the mean strength_score, i.e. a value on the target scale,
# not an error, so it cannot be compared with an RMSE directly. The
# comparable figure is the RMSE of a model that always predicts the mean;
# the mean is taken from the training split only so that no test
# information leaks into the baseline.
baseline_rmse = ((y_test - y_train.mean()) ** 2).mean() ** 0.5

print("RMSE: ", rmse)
print("Baseline: ", baseline)
print("Baseline RMSE: ", baseline_rmse)



RMSE:  29.4450898987045
Baseline:  39.45862068965517
