# Importing Libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Importing Dataset

In [3]:
dataset=pd.read_csv('social_media.csv')

In [4]:
dataset.head()

Unnamed: 0,S.no,Tiktoker name,Tiktok name,Subscribers,Views avg.,Likes avg.,Comments avg.,Shares avg.
0,1,jypestraykids,Stray Kids,13.8M,6.4M,2.3M,50.2K,34.2K
1,2,khaby.lame,Khabane lame,149.2M,17.3M,2.3M,15.2K,8.7K
2,3,scarlettsspam2,scarlett,2.1M,17.9M,845.8K,53.9K,6.3K
3,4,addisonre,Addison Rae,88.7M,22M,906.6K,7.6K,26.2K
4,5,belindatok,Belinda,4.8M,14.2M,1.5M,14.5K,15.3K


In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   S.no           1000 non-null   int64 
 1   Tiktoker name  1000 non-null   object
 2   Tiktok name    999 non-null    object
 3   Subscribers    1000 non-null   object
 4   Views avg.     1000 non-null   object
 5   Likes avg.     1000 non-null   object
 6   Comments avg.  1000 non-null   object
 7   Shares avg.    1000 non-null   object
dtypes: int64(1), object(7)
memory usage: 62.6+ KB


# Data Preprocessing and Convertion to Numerical Data

In [6]:
dataset['Subscribers'] = dataset['Subscribers'].str.replace('K', 'e3').str.replace('M', 'e6').astype(float)

dataset.dropna(inplace=True) 


In [7]:
dataset['Views avg.'] = dataset['Views avg.'].str.replace('K', 'e3').str.replace('M', 'e6').astype(float)

dataset.dropna(inplace=True) 


In [8]:
dataset['Likes avg.'] = dataset['Likes avg.'].str.replace('K', 'e3').str.replace('M', 'e6').astype(float)

dataset.dropna(inplace=True) 


In [9]:
dataset['Comments avg.'] = dataset['Comments avg.'].str.replace('K', 'e3').str.replace('M', 'e6').astype(float)

dataset.dropna(inplace=True) 


In [10]:
dataset['Shares avg.'] = dataset['Shares avg.'].str.replace('K', 'e3').str.replace('M', 'e6').astype(float)

dataset.dropna(inplace=True) 


# Addition Of Popularity as a Feature for Prediction

In [11]:
dataset['Score'] = dataset['Subscribers'] + dataset['Views avg.'] + dataset['Likes avg.'] + dataset['Comments avg.'] + dataset['Shares avg.']
Average_score = dataset['Score'].mean()
dataset['Status'] = dataset['Score'].apply(lambda x: 'High' if x >= Average_score else 'Low')

In [12]:
dataset.head()

Unnamed: 0,S.no,Tiktoker name,Tiktok name,Subscribers,Views avg.,Likes avg.,Comments avg.,Shares avg.,Score,Status
0,1,jypestraykids,Stray Kids,13800000.0,6400000.0,2300000.0,50200.0,34200.0,22584400.0,High
1,2,khaby.lame,Khabane lame,149200000.0,17300000.0,2300000.0,15200.0,8700.0,168823900.0,High
2,3,scarlettsspam2,scarlett,2100000.0,17900000.0,845800.0,53900.0,6300.0,20906000.0,High
3,4,addisonre,Addison Rae,88700000.0,22000000.0,906600.0,7600.0,26200.0,111640400.0,High
4,5,belindatok,Belinda,4800000.0,14200000.0,1500000.0,14500.0,15300.0,20529800.0,High


In [12]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 999 entries, 0 to 999
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   S.no           999 non-null    int64   
 1   Tiktoker name  999 non-null    object  
 2   Tiktok name    999 non-null    object  
 3   Subscribers    999 non-null    float64 
 4   Views avg.     999 non-null    float64 
 5   Likes avg.     999 non-null    float64 
 6   Comments avg.  999 non-null    float64 
 7   Shares avg.    999 non-null    float64 
 8   Popularity     999 non-null    category
dtypes: category(1), float64(5), int64(1), object(2)
memory usage: 71.3+ KB


# Identifying Dependent and Independent Variables 

In [13]:
X = dataset[['Subscribers', 'Views avg.', 'Likes avg.', 'Comments avg.', 'Shares avg.']]

In [14]:
y = dataset['Popularity']

# Splitting the dataset as train and test for model creation

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVM Model Creation

In [16]:
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

# Prediction using model

In [17]:
y_pred = clf.predict(X_test)

In [18]:
y_pred

array(['low', 'high', 'medium', 'medium', 'medium', 'medium', 'medium',
       'medium', 'high', 'medium', 'medium', 'medium', 'medium', 'low',
       'medium', 'medium', 'medium', 'medium', 'medium', 'medium',
       'medium', 'medium', 'medium', 'medium', 'medium', 'high', 'high',
       'low', 'medium', 'medium', 'high', 'medium', 'low', 'medium',
       'low', 'low', 'medium', 'high', 'medium', 'high', 'medium', 'high',
       'medium', 'low', 'medium', 'low', 'medium', 'medium', 'high',
       'medium', 'medium', 'medium', 'high', 'high', 'high', 'medium',
       'high', 'medium', 'high', 'medium', 'medium', 'medium', 'high',
       'low', 'medium', 'high', 'medium', 'medium', 'low', 'low',
       'medium', 'high', 'medium', 'medium', 'high', 'medium', 'high',
       'medium', 'medium', 'medium', 'high', 'low', 'medium', 'high',
       'medium', 'medium', 'low', 'medium', 'high', 'high', 'high',
       'medium', 'medium', 'medium', 'high', 'medium', 'high', 'medium',
       'low',

In [19]:
y_test

453       low
794      high
209    medium
309    medium
741    medium
        ...  
78     medium
29       high
277       low
261       low
423    medium
Name: Popularity, Length: 200, dtype: category
Categories (3, object): ['low' < 'medium' < 'high']

# Classification Report and Accuracy Score

In [20]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        high       1.00      1.00      1.00        47
         low       1.00      1.00      1.00        38
      medium       1.00      1.00      1.00       115

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



In [21]:
ac = accuracy_score(y_pred,y_test)
print(ac)

1.0
