In [1]:
import pandas as pd
import numpy as np
import _pickle as pkl
from sklearn.model_selection import train_test_split
import re
import os
from bs4 import BeautifulSoup
from collections import Counter
from string import punctuation
from scipy.sparse import hstack
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk import pos_tag, word_tokenize

# Test and Train

In [2]:
df = pd.read_csv('./datasets/train.csv')
X = df.loc[:, 'Page content'].to_numpy()
y = df.loc[:,'Popularity'].to_numpy()

### Single Features Extractions 

In [None]:
def feature_selection_author(text):
    feature_str = ""
    soup = BeautifulSoup(text, 'html.parser')
    # Author formate may various from news website
    author = soup.find("span", {"class": "author_name"}) # default format
    if(author != None): 
        feature_str += author.text.replace("By ", "") + " "
    elif(soup.find("span") != None): 
        feature_str += soup.find("span").text + " "
    elif(soup.find("a") != None): 
        feature_str += soup.find("a").text + " "
    feature_str = re.sub(r'[\W]+', ' ', feature_str.lower())
    return feature_str

def feature_selection_titles(text):
    feature_str = ""
    soup = BeautifulSoup(text, 'html.parser')
    feature_str += soup.find("h1", {"class": "title"}).text + " "
    feature_str = re.sub(r'[\W]+', ' ', feature_str.lower())
    return feature_str

def feature_selection_channels(text):
    feature_str = ""
    soup = BeautifulSoup(text, 'html.parser')
    feature_str += soup.find("article").get("data-channel") + " "
    feature_str = re.sub(r'[\W]+', ' ', feature_str.lower())
    return feature_str

def feature_selection_topic(text):
    feature_str = ""
    soup = BeautifulSoup(text, 'html.parser')
    feature_str += soup.find("footer", {"class": 'article-topics'}).text.replace(" Topics: ", "").replace(",", "")
    feature_str = re.sub(r'[\W]+', ' ', feature_str.lower())
    return feature_str

def extract_weekend(text):
    soup = BeautifulSoup(text, 'html.parser')
    ret = 0
    if soup.time.text == None or soup.time.text == "":
        ret = 0
    else:
        if soup.time.get("datetime")[:3] in ["Sat", "Sun"]:
            ret = 1 
        else:
            ret = 0
    return ret

def extract_word_count(text):
    soup = BeautifulSoup(text, 'html.parser')
    # Word Count
    text_p = (''.join(s.findAll(string=True))for s in soup.findAll('p'))
    c_p = Counter((x.rstrip(punctuation).lower() for y in text_p for x in y.split()))
    text_div = (''.join(s.findAll(string=True))for s in soup.findAll('div'))
    c_div = Counter((x.rstrip(punctuation).lower() for y in text_div for x in y.split()))
    total = c_div + c_p
    return len(list(total.elements()))

def extract_media_count(text):
    soup = BeautifulSoup(text, 'html.parser')
    section = soup.find("section", {"class": "article-content"})
    img_count = len(section.find_all("img")) + len(section.find_all("picture")) + len(section.find_all("figure"))
    video_count = len(section.find_all("video")) + len(section.find_all("iframe"))
    media_count = img_count + video_count
    return media_count

def extract_appealing_count(text):
    soup = BeautifulSoup(text, 'html.parser')
    section = soup.find("section", {"class": "article-content"})
    link_count = len(section.find_all("a"))
    strong_count = len(section.find_all("strong"))
    appealing_count = link_count + strong_count
    return appealing_count

print(feature_selection_topic(X[4]))

### Tokenizer

In [3]:
POSITIVE_WORDS = os.path.join(os.getcwd(), 'datasets', 'positive-words.txt')
NEGATIVE_WORDS = os.path.join(os.getcwd(), 'datasets', 'negative-words.txt')
pos_words = []
neg_words = []


for line in open(POSITIVE_WORDS, 'r').readlines()[35:]:
    word = line.rstrip()
    pos_words.append(word)

for line in open(NEGATIVE_WORDS, 'r').readlines()[35:]:
    word = line.rstrip()
    neg_words.append(word)

In [4]:
stop = stopwords.words('english')

def tokenizer_stem_neg(text):
    stemmer = LancasterStemmer()
    return [stemmer.stem(w) for w in re.split(r'\s+', text.strip()) \
            if w in neg_words and re.match('[a-zA-Z]+', w)]

def tokenizer_stem_pos(text):
    stemmer = LancasterStemmer()
    return [stemmer.stem(w) for w in re.split(r'\s+', text.strip()) \
            if w in pos_words and re.match('[a-zA-Z]+', w)]
def tokenizer_stem_nostop(text):
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split(r'\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]

### Out of mem 

In [None]:
def get_stream(path, size):
    for chunk in pd.read_csv(path, chunksize=size):
        yield chunk
batch_size = 1024
iters = int((27643+batch_size-1)/(batch_size*2))

In [None]:
stream = get_stream(path='./datasets/train.csv', size=batch_size)
idx = 0
X_train, y_train = None, None
batch_X, batch_y = None, None
for z in range(iters):
    batch = next(stream)
    if(idx==0):
        X_train= batch['Page content']
        y_train = batch['Popularity']
        idx+=1
    else:
        X_train = np.concatenate((X_train, batch['Page content']))
        y_train = np.concatenate((y_train, batch['Popularity']))
    batch = next(stream)

### Tfidf

In [None]:
tfidf_author = TfidfVectorizer(ngram_range=(1,2),
                        preprocessor=feature_selection_author,
                        tokenizer=tokenizer_stem_nostop)

tfidf_channel = TfidfVectorizer(ngram_range=(1,2),
                        preprocessor=feature_selection_channels,
                        tokenizer=tokenizer_stem_nostop)

tfidf_title = TfidfVectorizer(ngram_range=(1,2),
                        preprocessor=feature_selection_titles,
                        tokenizer=tokenizer_stem_nostop)

tfidf_topic = TfidfVectorizer(ngram_range=(1,2),
                        preprocessor=feature_selection_topic,
                        tokenizer=tokenizer_stem_nostop)
# tfidf_author.fit(X_train)
# #tfidf_channel.fit(X_train)
# tfidf_title.fit(X_train)
# tfidf_topic.fit(X_train)

### hash

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer

# hash words to 1024 buckets
hashvec_author = HashingVectorizer(n_features=2**10,
                            preprocessor=feature_selection_author,
                            tokenizer=tokenizer_stem_nostop)

hashvec_channel = HashingVectorizer(n_features=2**10,
                            preprocessor=feature_selection_channels,
                            tokenizer=tokenizer_stem_nostop)

hashvec_title = HashingVectorizer(n_features=2**10,
                            preprocessor=feature_selection_titles,
                            tokenizer=tokenizer_stem_nostop)

hashvec_topic = HashingVectorizer(n_features=2**10,
                            preprocessor=feature_selection_topic,
                            tokenizer=tokenizer_stem_nostop)

### Combine Feature Function


In [None]:
def combine(x_batch):
    x_batch_channel = ""
    x_batch_title = tfidf_title.transform(x_batch).toarray()
    #print(x_batch_title.shape)
    x_batch_author = tfidf_author.transform(x_batch).toarray()
    #print(x_batch_author.shape)
    x_batch_topic = tfidf_topic.transform(x_batch).toarray()
    #print(x_batch_topic.shape)
    # x_batch_channel = tfidf_channel.transform(x_batch).toarray()
    #print(x_batch_channel.shape)
    weekend_vectorized = np.vectorize(extract_weekend)
    x_batch_weekend = weekend_vectorized(x_batch).reshape(len(x_batch),1)
    #print(x_batch_weekend.shape)
    media_vectorized = np.vectorize(extract_media_count)
    x_batch_media = media_vectorized(x_batch).reshape(len(x_batch),1)
    #print(x_batch_media.shape)
    appealing_count_vectorized = np.vectorize(extract_appealing_count)
    x_batch_appealing = appealing_count_vectorized(x_batch).reshape(len(x_batch),1)
    #print(x_batch_appealing.shape)
    word_vectorized = np.vectorize(extract_word_count)
    x_batch_word = word_vectorized(x_batch).reshape(len(x_batch),1)
    #print(x_batch_word.shape)
    # combined_x_batch = np.concatenate((x_batch_title, x_batch_author, x_batch_channel, x_batch_topic),axis=1)
    # , x_batch_weekend
    #                            , x_batch_media, x_batch_appealing, x_batch_word
    #print(combined_x_batch.shape)
    return x_batch_title, x_batch_author, x_batch_channel, x_batch_topic, x_batch_weekend, x_batch_media, x_batch_appealing, x_batch_word

### Features Selection


In [5]:
def feature_selection_part1(data):
    feature_str = ""
    soup = BeautifulSoup(data, 'html.parser')

    # Title
    feature_str += soup.find("h1", {"class": "title"}).text + " "

    # Channel
    feature_str += soup.find("article").get("data-channel") + " "

    # Author
    author_re = r'(?:By\s|by\s)?([a-zA-Z]+(\s[A-Z][a-z]+)*)'
    if soup.head.find("span") == None:
        feature_str += " "
    else:
        author = re.search(author_re, soup.head.find("span").text).group(1)
        feature_str += author.split(" ")[-1] + " "

    section = soup.find("section", {"class": "article-content"})
    paragraph = section.find_all("p")
    pos_count = 0
    for tag in paragraph:
        pos_count += len(tokenizer_stem_pos(tag.text))
        for word in tokenizer_stem_pos(tag.text):
            feature_str += word + " "

    neg_count = 0
    for tag in paragraph:
        neg_count += len(tokenizer_stem_neg(tag.text))
        for word in tokenizer_stem_neg(tag.text):
            feature_str += word + " "

    # Related Topics
    feature_str += soup.find("footer", {"class": 'article-topics'}).text.replace(" Topics: ", "").replace(",", "")
    
    feature_str = re.sub(r'[.:\']', '', feature_str.lower())
    return feature_str
print(feature_selection_part1(X[0]))

nasas grand challenge stop asteroids from destroying earth world moskowitz improv pref confid win support benefit valu integr win support import accompl innov tim kil dang threats asteroid asteroids challenge earth space us world 


In [6]:
def feature_selection_part2(data):
    X = []
    idx=0
    author_score =dict()
    for html in data:
        soup = BeautifulSoup(html, 'html.parser')
        # Author
        author_re = r'(?:By\s|by\s)?([a-zA-Z]+(\s[A-Z][a-z]+)*)'
        if soup.head.find("span") == None:
            continue
        else:
            author = re.search(author_re, soup.head.find("span").text).group(1)
            if author in author_score:
                author_score[author]+=int(y[idx])
            else : 
                author_score[author] = 0
                author_score[author]+=int(y[idx])
        idx+=1
    for html in data:
        soup = BeautifulSoup(html, 'html.parser')
        feature_list = []
        # Author score
        author_re = r'(?:By\s|by\s)?([a-zA-Z]+(\s[A-Z][a-z]+)*)'
        if soup.head.find("span") == None:
            feature_list.append(0)
        else:
            author = re.search(author_re, soup.head.find("span").text).group(1)
            feature_list.append(author_score[author])
            print(author_score[author])

        # word count of title
        feature_list.append(len(soup.find("h1", {"class": "title"}).text))
        # average word length and unique word rate
        words = re.findall(r'\w+', soup.get_text().lower())
        if words:
            total_words = len(words)
            unique_words = set(words)
            unique_word_count = len(unique_words)
            total_length = sum(len(word) for word in words)
            unique_rate = unique_word_count / total_words
            average_length = total_length / len(words)
            feature_list.append(unique_rate)
            feature_list.append(average_length)
        
        # Time
        if soup.time.text == None or soup.time.text == "":
            feature_list.append(0)
            feature_list.append(0)
        else:
            month = int(re.search(r'(\d+-\d+-\d+)', soup.time.text).group(1).split("-")[1])
            feature_list.append(month)
            hour = int(re.search(r'(\d+:\d+:\d+)', soup.time.text).group(1)[:5].split(":")[0])
            minute = int(re.search(r'(\d+:\d+:\d+)', soup.time.text).group(1)[:5].split(":")[1])
            feature_list.append(hour * 60 + minute)

        # Weekend
        if soup.time.text == None or soup.time.text == "":
            feature_list.append(0)
        else:
            feature_list.append(1 if soup.time.get("datetime")[:3] in ["Sat", "Sun"] else 0)

        # Word Count
        text_p = (''.join(s.findAll(string=True))for s in soup.findAll('p'))
        c_p = Counter((x.rstrip(punctuation).lower() for y in text_p for x in y.split()))
        text_div = (''.join(s.findAll(string=True))for s in soup.findAll('div'))
        c_div = Counter((x.rstrip(punctuation).lower() for y in text_div for x in y.split()))
        total = c_div + c_p
        feature_list.append(len(list(total.elements())))

        section = soup.find("section", {"class": "article-content"})

        # Video + Image count
        img_count = len(section.find_all("img")) + len(section.find_all("picture")) + len(section.find_all("figure"))
        video_count = len(section.find_all("video")) + len(section.find_all("iframe"))
        media_count = img_count + video_count
        feature_list.append(media_count)

        # Appealing count
        link_count = len(section.find_all("a"))
        strong_count = len(section.find_all("strong"))
        appealing_count = link_count + strong_count
        feature_list.append(appealing_count)

        # POS & NEG count
        paragraph = section.find_all("p")
        pos_count = 0
        neg_count = 0
        q_count = 0
        ex_count = 0
        for tag in paragraph:
            pos_count += len(tokenizer_stem_pos(tag.text))
            neg_count += len(tokenizer_stem_neg(tag.text))
            if tag.text.find("?") != -1:
                q_count += tag.text.find("?")
            if tag.text.find("!") != -1:
                ex_count += tag.text.find("!")
        feature_list.append(pos_count)    
        feature_list.append(neg_count)
        feature_list.append(q_count)
        feature_list.append(ex_count)

        X.append(feature_list)
    return np.array(X)
feature_selection_part2(X[0:100])

-1
2
0
0
-1
-1
0
-1
1
-2
-1
2
-1
0
0
-1
1
2
1
0
3
-1
-2
1
-1
1
0
-2
1
-1
1
3
-1
3
2
1
1
1
3
1
-1
-1
0
1
0
1
0
-1
1
0
-1
3
-2
-1
1
0
-1
1
1
1
-1
-1
-1
0
-2
1
-1
0
0
1
-1
-1
-1
1
0
-1
-1
1
-1
-1
1
-1
-1
1
-1
1
2
-1
0
-2
1
1
2
0
3
1
0
0
1


array([[-1.00000000e+00,  6.00000000e+01,  4.89731438e-01, ...,
         3.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 2.00000000e+00,  7.40000000e+01,  4.74719101e-01, ...,
         3.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  6.80000000e+01,  4.98765432e-01, ...,
         2.10000000e+01,  7.05000000e+02,  3.80000000e+01],
       ...,
       [ 0.00000000e+00,  4.10000000e+01,  4.49786325e-01, ...,
         1.40000000e+01,  7.85000000e+02,  5.24000000e+02],
       [ 0.00000000e+00,  5.90000000e+01,  5.12362637e-01, ...,
         3.00000000e+00,  5.60000000e+01,  0.00000000e+00],
       [ 1.00000000e+00,  5.80000000e+01,  6.45522388e-01, ...,
         0.00000000e+00,  2.05000000e+02,  0.00000000e+00]])

In [7]:
def feature_selection_part3(data):
    feature_str = ""
    soup = BeautifulSoup(data, 'html.parser')
    section = soup.find("section", {"class": "article-content"})
    paragraph = section.find_all("p")
    first = 0
    last =""
    last_2=""
    for tag in paragraph:
        if(not first):
            feature_str += tag.text + " "
            first = 1
        else:
            last_2=last
            last = tag.text
    if(last_2):feature_str += last_2 + " "
    feature_str = re.sub(r'[.:\',$()`]', '', feature_str.lower())
    return feature_str

print(feature_selection_part3(X[0]))
    

there may be killer asteroids headed for earth and nasa has decided to do something about it the space agency announced a new "grand challenge" on june 18 to find all dangerous space rocks and figure out how to stop them from destroying our planet see also how it works nasa asteroid-captureresponses to the request for information which also seeks ideas for detecting and mitigating asteroid threats are due july 18the asteroid-retrieval mission designed to provide the first deep-space mission for astronauts flying on nasas space launch system rocket and orion space capsule under development has come under fire from lawmakers who would prefer that nasa return to the moona draft nasa authorization bill from the house space subcommittee which is currently in debate would cancel the mission and steer the agency toward other projects that bill will be discussed during a hearing wednesday june 19 at 10 am edtsee also how it works nasa asteroid-capture mission in picturesbut nasa officials defe

In [8]:
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier

### Random + GBC

#### training vectorizer

In [14]:
X_train, X_valid, Y_train, Y_valid = train_test_split(X, y, test_size=.2)
print('Begin vectorizer Training...')

tfidf = TfidfVectorizer(ngram_range=(1,2),
                        preprocessor=feature_selection_part1,
                        tokenizer=tokenizer_stem_nostop)
tfidf.fit(X)

tfidf_type = TfidfVectorizer(ngram_range=(1,2),
                        preprocessor=feature_selection_part3,
                        tokenizer=tokenizer_stem_nostop)
tfidf_type.fit(X)

Begin vectorizer Training...


#### training model

In [15]:
print('Begin Type Training...')
type_model = SGDClassifier(loss='log_loss', max_iter=200)
x_train_type = tfidf_type.transform(X_train)
y_train = LabelEncoder().fit_transform(Y_train)
type_model.fit(x_train_type, y_train)

print('Begin Word Training...')
word_model = RandomForestClassifier()
x_train_word = tfidf.transform(X_train)
y_train = LabelEncoder().fit_transform(Y_train)
word_model.fit(x_train_word, y_train)

print('Begin Stats Training...')
stats_model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
x_train_stats = feature_selection_part2(X_train)
stats_model.fit(x_train_stats, y_train)

Begin Type Training...
Begin Word Training...
Begin Stats Training...


#### evaluation

In [16]:

print('Begin Evaluation...')
y_valid = LabelEncoder().fit_transform(Y_valid)

y_pred_type_train = type_model.predict_proba(x_train_type)[:,1]
y_pred_word_train = word_model.predict_proba(x_train_word)[:,1]
y_pred_stats_train = stats_model.predict_proba(x_train_stats)[:,1]

y_pred_type_valid = type_model.predict_proba(tfidf_type.transform(X_valid))[:,1]
y_pred_word_valid = word_model.predict_proba(tfidf.transform(X_valid))[:,1]
y_pred_stats_valid = stats_model.predict_proba(feature_selection_part2(X_valid))[:,1]

print("Type Model")
print(f'Train score: {roc_auc_score(y_train, y_pred_type_train)}')
print(f'Valid score: {roc_auc_score(y_valid, y_pred_type_valid)}')

print("Word Model")
print(f'Train score: {roc_auc_score(y_train, y_pred_word_train)}')
print(f'Valid score: {roc_auc_score(y_valid, y_pred_word_valid)}')

print("Stats Model")
print(f'Train score: {roc_auc_score(y_train, y_pred_stats_train)}')
print(f'Valid score: {roc_auc_score(y_valid, y_pred_stats_valid)}')

Begin Evaluation...
Type Model
Train score: 0.6221211501136162
Valid score: 0.521331222692272
Word Model
Train score: 0.9999999959097893
Valid score: 0.5413205778338172
Stats Model
Train score: 0.5943545781500547
Valid score: 0.5632246327000101


In [53]:
from sklearn import svm
print('Begin Type Training...')

x_train_type = tfidf_type.transform(X_train)
y_train = LabelEncoder().fit_transform(Y_train)
type_model.fit(x_train_type, y_train)
y_pred_type_train = type_model.predict_proba(x_train_type)[:,1]
y_pred_type_valid = type_model.predict_proba(tfidf_type.transform(X_valid))[:,1]
print("Type Model")
print(f'Train score: {roc_auc_score(y_train, y_pred_type_train)}')
print(f'Valid score: {roc_auc_score(y_valid, y_pred_type_valid)}')
#0.5431

Begin Type Training...
Type Model
Train score: 0.9450321109362028
Valid score: 0.543208698145595


#### find best para

In [34]:
train_score = []
valid_score = []
best_coef = []
best_train = 0
best_valid = 0
coef_ = np.linspace(0,.5,21)
print('Find best para...')
for i in coef_:
    for j in coef_:
        train_score = roc_auc_score(y_train, y_pred_word_train*(i) + y_pred_stats_train*(1-i) + y_pred_type_train*(1-i-j))
        valid_score = roc_auc_score(y_valid, y_pred_word_valid*(i) + y_pred_stats_valid*(1-i) + y_pred_type_valid*(1-i-j))
        if valid_score > best_valid:
            best_valid = valid_score
            best_train = train_score
            best_coef = [i,j,1-i-j]

print(f'Train score: {best_train}')
print(f'Valid score: {best_valid}')

Find best para...
Train score: 0.9907529577459627
Valid score: 0.5756814738873636


### back up model

In [18]:
pkl.dump(word_model, open("output/backup_word.pickle", "wb"))
pkl.dump(stats_model, open("output/backup_stats.pickle", "wb"))
pkl.dump(tfidf, open('output/tfidf_part1.pickle', "wb"))
pkl.dump(tfidf_type, open('output/tfidf_first_paragraph.pickle', "wb"))
pkl.dump(tfidf_type, open('output/backup_topic.pickle', "wb"))

### Average weight error eurve

In [None]:
import os
if not os.path.exists('output'):
    os.mkdir('output')

import matplotlib.pyplot as plt

plt.plot(range(1, len(train_auc)+1), train_auc, color='blue', label='Train auc')
plt.plot(range(1, len(train_auc)+1), val_auc, color='red', label='Val auc')
plt.legend(loc="best")
plt.xlabel('#Batches')
plt.ylabel('Auc')
plt.tight_layout()
plt.show()

### Prediction

In [19]:
df = pd.read_csv('./datasets/test.csv')
print("Content:\n",df.loc[0])

Id = df.loc[:, 'Id'].to_numpy()
X_test = df.loc[:, 'Page content'].to_numpy()
print(X_test.shape)
print("\nId:\n", Id[0])
print("Content:\n", X_test[0])

Content:
 Id                                                          27643
Page content    <html><head><div class="article-info"><span cl...
Name: 0, dtype: object
(11847,)

Id:
 27643
Content:
 <html><head><div class="article-info"><span class="byline "><a href="/author/sam-laird/"><img alt="2016%2f09%2f15%2f63%2fhttpsd2mhye01h4nj2n.cloudfront.netmediazgkymde1lza2.9814b" class="author_image" src="http://i.amz.mshcdn.com/-qaMPB8aiQeIaoBhqlU0OLjA07A=/90x90/2016%2F09%2F15%2F63%2Fhttpsd2mhye01h4nj2n.cloudfront.netmediaZgkyMDE1LzA2.9814b.jpg"/></a><span class="author_name">By <a href="/author/sam-laird/">Sam Laird</a></span><time datetime="Mon, 09 Sep 2013 19:47:02 +0000">2013-09-09 19:47:02 UTC</time></span></div></head><body><h1 class="title">Soccer Star Gets Twitter Death Threats After Tackling One Direction Member</h1><figure class="article-image"></figure><article data-channel="entertainment"><section class="article-content"> <div class="shift-to-hero"> <p><iframe allowfullscreen="" 

In [20]:
x_test_type = tfidf_type.transform(X_test)
x_test_word = tfidf.transform(X_test)
x_test_stats = feature_selection_part2(X_test)

y_pred_type_test = type_model.predict_proba(x_test_type)[:,1]
y_pred_word_test = word_model.predict_proba(x_test_word)[:,1]
y_pred_stats_test = stats_model.predict_proba(x_test_stats)[:,1]
y_pred = np.around(y_pred_word_test*(i) + y_pred_stats_test*(j) + y_pred_type_test*(1-i-j), decimals=3)

KeyboardInterrupt: 

In [None]:
output_data = {'Id': Id, 'Popularity': y_pred}
output_dataframe = pd.DataFrame(output_data)
print(output_dataframe)

output_dataframe.to_csv("./datasets/y_pred.csv", index=None)