In [None]:
# Mount Google Drive at /content/drive so the cells below can read files
# stored under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import string
import fasttext

# Fetch the NLTK resources requested by name: 'punkt', 'stopwords' and 'wordnet'.
# preprocess_text below relies on word_tokenize, stopwords and WordNetLemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the creator dataset
creator_data = pd.read_csv('/content/drive/MyDrive/Tiktok_Project/cleaned_creator_dataset.csv')

# Load the fastText model. It is bound to the module-level name `model`,
# which calculate_similarity reads as a global.
# NOTE: a later cell in this notebook rebinds `model` to a Keras model, so
# calculate_similarity must be run before that cell.
model = fasttext.load_model('/content/drive/MyDrive/Tiktok_Project/cc.en.300.bin')

# Preprocess Text Data
def preprocess_text(text):
    """Turn raw text into a list of lemmatised, stopword-free tokens.

    Processing order: lowercase, remove every character found in
    ``string.punctuation``, tokenise with ``word_tokenize``, drop English
    stopwords, then lemmatise each remaining token with ``WordNetLemmatizer``.

    Args:
        text: Raw content string. Any non-string value (for example ``None``
            or the float NaN pandas uses for a missing cell) is treated as
            empty content instead of raising ``AttributeError``.

    Returns:
        list[str]: The processed tokens; an empty list when there is no text.
    """
    if not isinstance(text, str):
        return []

    lowered = text.lower()
    # One translate() pass drops all punctuation characters.
    without_punctuation = lowered.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(without_punctuation)

    # Build the stopword set once per call. Calling stopwords.words() inside
    # the filter re-reads the list and scans it linearly for every token.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

# Run preprocess_text on every row's 'Content' and store the result in a new column.
creator_data['Processed_Content'] = creator_data['Content'].apply(preprocess_text)

# Perform Keyword Extraction
def extract_keywords(tokens, num_keywords=5):
    """Return the most frequent vocabulary terms of one tokenised document.

    The tokens are joined into a single document and counted with
    ``CountVectorizer``; the ``num_keywords`` terms with the highest counts
    are returned.

    Args:
        tokens: Iterable of string tokens for a single document.
        num_keywords: Maximum number of keywords to return. Values <= 0 yield
            an empty list.

    Returns:
        list[str]: Up to ``num_keywords`` terms, ordered by ascending count
        (the most frequent term is last). Empty when there are no tokens or
        when ``CountVectorizer`` finds no usable term.
    """
    if not tokens or num_keywords <= 0:
        return []

    vectorizer = CountVectorizer()
    try:
        counts = vectorizer.fit_transform([' '.join(tokens)])
    except ValueError:
        # CountVectorizer raises ValueError when its own tokenisation leaves
        # an empty vocabulary; treat that as "no keywords".
        return []

    vocabulary = vectorizer.get_feature_names_out()
    # Summing a sparse matrix over axis 0 gives a 2-D (1, n_terms) matrix.
    # Flatten it to a 1-D array so argsort/indexing yield scalar positions;
    # iterating the 2-D matrix would yield a whole row, not single indices.
    totals = np.asarray(counts.sum(axis=0)).ravel()
    top_indices = totals.argsort()[-num_keywords:]
    return [vocabulary[index] for index in top_indices]

# Run extract_keywords (default num_keywords=5) on each row's token list.
creator_data['Keywords'] = creator_data['Processed_Content'].apply(extract_keywords)

# Text Classification or Topic Modeling
def classify_content(tokens, num_topics=3, random_state=0):
    """Fit an LDA model on one document and return the top term of each topic.

    The tokens are joined into a single document, counted with
    ``CountVectorizer``, and a ``LatentDirichletAllocation`` model with
    ``num_topics`` components is fit on that one-document matrix. For every
    topic the vocabulary term with the largest component weight is returned.

    NOTE(review): the model is fit on a corpus of exactly one document, so
    the topics are presumably weakly identified; consider fitting a single
    LDA over all rows instead.

    Args:
        tokens: Iterable of string tokens for a single document.
        num_topics: Number of LDA components.
        random_state: Seed forwarded to ``LatentDirichletAllocation`` so that
            repeated runs give the same topics. Pass ``None`` for unseeded
            behaviour.

    Returns:
        list[str]: One term per topic; empty when there are no tokens or when
        ``CountVectorizer`` finds no usable term.
    """
    if not tokens:
        return []

    vectorizer = CountVectorizer()
    try:
        doc_term = vectorizer.fit_transform([' '.join(tokens)])
    except ValueError:
        # Raised by CountVectorizer when the resulting vocabulary is empty.
        return []

    lda = LatentDirichletAllocation(n_components=num_topics, random_state=random_state)
    lda.fit(doc_term)

    # Look the vocabulary up once instead of once per topic.
    vocabulary = vectorizer.get_feature_names_out()
    top_term_per_topic = lda.components_.argmax(axis=1)
    return [vocabulary[index] for index in top_term_per_topic]

# Run classify_content (default num_topics=3) on each row's token list.
creator_data['Topics'] = creator_data['Processed_Content'].apply(classify_content)

# Calculate Content Similarity
def calculate_similarity(tokens, predefined_categories):
    """Cosine similarity between a token list and each category label.

    The content is embedded as the mean of ``model.get_word_vector`` over its
    tokens. Each category label is embedded the same way, as the mean of the
    word vectors of its whitespace-separated words, so that multi-word labels
    such as 'dance videos' are not looked up as one out-of-vocabulary string.
    Single-word labels get exactly their own word vector.

    Relies on the module-level ``model`` exposing ``get_word_vector``.

    Args:
        tokens: List of string tokens describing one piece of content.
        predefined_categories: Iterable of category label strings.

    Returns:
        dict[str, float]: Category label -> cosine similarity. The value is
        NaN when the similarity is undefined: no tokens, or a zero-length
        content or category vector.
    """
    undefined = float('nan')
    if not tokens:
        # No content vector can be built; previously this path passed None
        # into np.dot and raised TypeError.
        return {category: undefined for category in predefined_categories}

    content_vector = np.mean([model.get_word_vector(token) for token in tokens], axis=0)
    # The content norm does not depend on the category; compute it once.
    content_norm = np.linalg.norm(content_vector)

    similarity_scores = {}
    for category in predefined_categories:
        words = category.split() or [category]
        category_vector = np.mean([model.get_word_vector(word) for word in words], axis=0)
        denominator = content_norm * np.linalg.norm(category_vector)
        if denominator == 0:
            # Avoid dividing by zero for all-zero vectors.
            similarity_scores[category] = undefined
        else:
            # Cast to a plain float so the value serialises cleanly.
            similarity_scores[category] = float(np.dot(content_vector, category_vector) / denominator)
    return similarity_scores

# Category labels every row is scored against
predefined_categories = [
    'dance videos',
    'comedy sketches',
    'makeup tutorials',
    'lifestyle',
    'lip syncing',
    'beatboxing',
    'vlogs',
    'parodies',
    'magic tricks',
    'video editing',
    'music',
    'reaction videos',
    'beauty content',
]

# Score each row's token list against all categories
creator_data['Similarity_Scores'] = creator_data['Processed_Content'].apply(
    calculate_similarity, args=(predefined_categories,)
)

# Write the text-analysis columns to a CSV in the working directory
creator_data[
    ['Content', 'Processed_Content', 'Keywords', 'Topics', 'Similarity_Scores']
].to_csv('content_analysis_results.csv', index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Download the spaCy package 'en_core_web_md', which a later cell loads with spacy.load.
!python -m spacy download en_core_web_md

2023-08-17 04:28:53.470543: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-md==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.6.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [None]:
import pandas as pd
import numpy as np
import spacy
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the creator dataset.
# NOTE: this reads the same CSV as the earlier NLTK/fastText cell and rebinds
# `creator_data`, so columns added to the earlier frame are not present here.
creator_data = pd.read_csv('/content/drive/MyDrive/Tiktok_Project/cleaned_creator_dataset.csv')

# Load the spaCy pipeline; get_vector below reads it through the global `nlp`.
nlp = spacy.load("en_core_web_md")

# Preprocess text data
def preprocess_text(text):
    """Lowercase ``text`` and collapse non-word and whitespace runs to one space.

    Args:
        text: The string to normalise.

    Returns:
        str: The lowercased text in which every run of non-word characters,
        and then every run of whitespace, has been replaced by a single
        space. Leading/trailing spaces produced this way are not stripped.
    """
    normalized = text.lower()
    # Same two substitutions, applied in the same order.
    for pattern in (r'\W+', r'\s+'):
        normalized = re.sub(pattern, ' ', normalized)
    return normalized

# Normalise every row's 'Content' with preprocess_text (result is a string per row).
creator_data['Processed_Content'] = creator_data['Content'].apply(preprocess_text)

# Get vector representation of content using spacy
def get_vector(text):
    """Return the ``.vector`` attribute of ``nlp(text)``.

    Args:
        text: The string handed to the module-level ``nlp`` callable.

    Returns:
        Whatever ``nlp(text).vector`` evaluates to.
    """
    doc = nlp(text)
    return doc.vector

# Compute get_vector for every row's processed text and store it per row.
creator_data['Content_Vector'] = creator_data['Processed_Content'].apply(get_vector)

# Predefined categories and their vector representations
categories = [
    'dance videos',
    'comedy sketches',
    'makeup tutorials',
    'lifestyle',
    'lip syncing',
    'beatboxing',
    'vlogs',
    'parodies',
    'magic tricks',
    'video editing',
    'music',
    'reaction videos',
    'beauty content',
]
category_vectors = {category: get_vector(category) for category in categories}

# One column of cosine similarities per category, computed row by row
similarity_scores = {}
for category, category_vector in category_vectors.items():
    similarity_scores[category] = creator_data['Content_Vector'].apply(
        lambda x: cosine_similarity([x], [category_vector])[0][0]
    )

# Assemble the table and key it by creator name
similarity_df = (
    pd.DataFrame(similarity_scores)
    .assign(**{'Creator Name': creator_data['Creator Name']})
    .set_index('Creator Name')
)

# Write the table to a CSV in the working directory
similarity_df.to_csv('content_similarity_scores.csv')

print("Content similarity scores saved.")


Content similarity scores saved.


In [None]:
import ast

import cv2
import numpy as np
import pandas as pd
from moviepy.editor import VideoFileClip
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.models import Model

# Reload the creator dataset (same CSV path as the earlier cells); this
# rebinds `creator_data`.
creator_data = pd.read_csv('/content/drive/MyDrive/Tiktok_Project/cleaned_creator_dataset.csv')

# InceptionV3 with ImageNet weights, wrapped so that predict() returns the
# output of its 'avg_pool' layer.
# NOTE: this rebinds the module-level name `model`, which an earlier cell
# assigned to the fastText model; calculate_similarity reads that global and
# will not work if called after this cell has run.
base_model = InceptionV3(weights='imagenet')
model = Model(inputs=base_model.input, outputs=base_model.get_layer('avg_pool').output)

def extract_features_from_video(video_path):
    """Return the mean feature vector of a video's sampled frames.

    Frames are read with ``VideoFileClip.iter_frames(fps=1, dtype='uint8')``,
    resized to 299x299 with ``cv2.resize``, passed through
    ``preprocess_input`` and the module-level ``model``; the per-frame
    outputs are averaged over the frame axis.

    Args:
        video_path: Location of the video, passed as-is to ``VideoFileClip``.

    Returns:
        numpy.ndarray: ``model.predict`` output averaged over axis 0.

    Raises:
        ValueError: If no frame could be sampled from the video.
    """
    clip = VideoFileClip(video_path)
    try:
        frames = [
            cv2.resize(frame, (299, 299))
            for frame in clip.iter_frames(fps=1, dtype='uint8')
        ]
    finally:
        # Always close the clip so its reader is not left open, including
        # when decoding or resizing fails part-way through.
        clip.close()

    if not frames:
        # Fail with a clear message instead of an opaque error from
        # predicting on (or averaging) an empty batch.
        raise ValueError('No frames could be sampled from video: {!r}'.format(video_path))

    batch = preprocess_input(np.array(frames))
    features = model.predict(batch)
    return np.mean(features, axis=0)

def process_videos(video_paths):
    """Extract one feature vector per video.

    Args:
        video_paths: Iterable of video locations, each handed to
            ``extract_features_from_video``.

    Returns:
        list: The results of ``extract_features_from_video``, in input order.
    """
    return [extract_features_from_video(path) for path in video_paths]

# Convert each 'Video URL' cell from its string form to a list.
# SECURITY: this previously used eval(), which executes any expression found
# in the CSV. ast.literal_eval accepts Python literals only, so a list literal
# parses identically while anything else raises instead of running.
creator_data['Video URL'] = creator_data['Video URL'].apply(ast.literal_eval)
creator_data['Visual_Features'] = creator_data['Video URL'].apply(process_videos)

# to_csv stores the text form of each cell. For numpy arrays longer than
# numpy's print threshold (1000 elements by default) that text is abbreviated
# with "...", which would silently drop most of every feature vector.
# Serialise the vectors as plain nested lists for the file only; the in-memory
# column keeps its arrays.
creator_data.assign(
    Visual_Features=creator_data['Visual_Features'].apply(
        lambda vectors: [np.asarray(vector).tolist() for vector in vectors]
    )
).to_csv('/content/drive/MyDrive/Tiktok_Project/Tiktok_Data_With_Features.csv', index=False)
































