# imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, f1_score
from sklearn.svm import SVC

%matplotlib inline

# Load the data

In [2]:
df_copy = pd.read_csv("C:/Users/golds/Documents/Studies/hit/Year B/semester A/Introduction to Data Science/Final Project/df_cleaned1.csv")
df = df_copy.copy()

Deleting the columns that contains words

In [3]:
df = df.drop('Unnamed: 0', axis=1)
df = df.drop('long_title', axis=1)
df = df.drop('main_reporter', axis=1)
df = df.drop('content', axis=1)
df = df.drop('url', axis=1)
df = df.drop('title', axis=1)


Changing the category column into numbers, so it will be easier to work with

In [4]:
category_to_number = {
    'ביטחוני': 1,
    'בעולם': 2,
    'פוליטי': 3,
    'פלילי': 4,
    'תרבות': 5,
    'כלכלה': 6
}
df['category'] = df['category'].replace(category_to_number)
df.head()

Unnamed: 0,category,num_of_reporters,views,number_of_words,interested,not_interested,publish_date_day,time_group
0,1,1,13460,403,315,31,3,3
1,1,1,6254,379,125,2,3,3
2,1,1,2649,162,53,1,3,3
3,1,2,9886,164,254,6,3,2
4,1,1,7879,264,197,7,3,2


## Linear regression

In [5]:
X = df.drop('views', axis=1)
y = df['views']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create the linear regression model and fit it to the data frame
model = LinearRegression()
model.fit(X, y)

# print the coefficients
Coefficients = pd.Series(model.coef_, index=X.columns)
print(Coefficients)
print('Intercept:', model.intercept_)
y_pred = model.predict(X_test)

# calculate the R-squared value
r2 = r2_score(y_test, y_pred)

# print the R-squared value
print('R²:', r2)

category            2974.658056
num_of_reporters    3238.526979
number_of_words       38.480704
interested            34.128573
not_interested        69.229388
publish_date_day     -46.007448
time_group           294.741244
dtype: float64
Intercept: -21549.62095249157
R²: 0.624289885418597


changing the views column into 5 groups

In [6]:
df['views_group'] = pd.qcut(df['views'], q=[0, 0.2, 0.4, 0.6, 0.8, 1], labels=[0, 1, 2, 3, 4])
df = df.drop('views', axis=1)

In [7]:
X = df.drop('views_group', axis=1)
y = df['views_group']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Random forest classifier

In [8]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
# calculate the R-squared value
r2 = r2_score(y_test, y_pred)
# print the R-squared value
print('R²:', r2)

importances = rf.feature_importances_
feature_names = X.columns
feature_importances = pd.Series(importances, index=feature_names)
print(feature_importances)

Accuracy: 0.6125984251968504
R²: 0.708077329613738
category            0.056153
num_of_reporters    0.020047
number_of_words     0.180500
interested          0.388664
not_interested      0.227353
publish_date_day    0.082782
time_group          0.044501
dtype: float64


## SVM

In [9]:
# Create a SVM classifier with a linear kernel
svm = SVC(kernel='linear')

# Train the SVM classifier on the training set
svm.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = svm.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('Accuracy:', accuracy)
print('R² score:', r2)

Accuracy: 0.6173228346456693
R² score: 0.7019477197368611
