In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_extraction.text import CountVectorizer


# Fetch one page of IMDb search results. The query string asks for 100 entries
# from the "top_1000" group, sorted by user rating.
url = 'https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating'
# A timeout keeps the notebook from hanging forever if the server never answers.
# NOTE(review): IMDb may reject the default python-requests User-Agent; if this
# starts failing with 403, pass a browser-like `headers=` here — confirm.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Extract the title and rating of every list entry.
# Assumes the first <a> in each entry is the title link and the first <strong>
# holds the numeric rating — TODO confirm against the current page markup.
titles = []
ratings = []
for movie in soup.find_all('div', class_='lister-item-content'):
    title_tag = movie.find('a')
    rating_tag = movie.find('strong')
    # find() returns None when the tag is missing; skip incomplete entries
    # rather than crash with AttributeError on `.text`.
    if title_tag is None or rating_tag is None:
        continue
    try:
        rating = float(rating_tag.text)
    except ValueError:
        # Rating text was not a number; an entry without a usable target
        # cannot be used for regression, so drop it.
        continue
    title = title_tag.text.strip()
    titles.append(title)
    ratings.append(rating)

# An empty result means the page layout (or the response) is not what this
# scraper expects; stop here with a clear message instead of failing later
# inside train_test_split.
if not titles:
    raise ValueError(
        "No movies were parsed from %s; the page markup may have changed." % url
    )

# One row per movie: the title text and its user rating.
df = pd.DataFrame({'Title': titles, 'Rating': ratings})

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['Title'],
    df['Rating'],
    test_size=0.2,
    random_state=42,
)

# Build the vocabulary from the training titles only, then encode both splits
# as word-count matrices using that same vocabulary.
vectorizer = CountVectorizer()
vectorizer.fit(X_train)
X_train, X_test = vectorizer.transform(X_train), vectorizer.transform(X_test)

# Fit an ordinary least-squares regressor mapping word counts to ratings.
model = LinearRegression().fit(X_train, y_train)

# Predict ratings for the held-out titles.
y_pred = model.predict(X_test)

# Report regression metrics on the held-out set: mean squared error and the
# coefficient of determination (R^2).
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('Mean squared error: %.2f' % mse)
print('Coefficient of determination: %.2f' % r2)


Mean squared error: 0.04
Coefficient of determination: 0.23
