Formatting input and output completely refactored.

D-MBLD · Mar 25, 2023 · a3e8edb · a3e8edb
1 parent d89e7e4
commit a3e8edb
Show file tree

Hide file tree

Showing 5 changed files with 482 additions and 44 deletions.
diff --git a/core/commands.php b/core/commands.php
@@ -21,6 +21,9 @@ class commands
 	/** @var \eb\telegram\core\forum_api */
 	private $forum_api;
 
+	/** @var \eb\telegram\core\formatters */
+	private $formatters;
+
 	/**
 	* Constructor
 	*
@@ -36,6 +39,7 @@ public function __construct(\phpbb\config\config $config,
 		$this->config = $config;
 		$this->language = $language;
 		$this->forum_api = $forum_api;
+		$this->formatters = new \eb\telegram\core\formatters(/*$config, $language*/);
 	}
 
 	public function onButtonOutdated($command)
@@ -283,7 +287,7 @@ public function onShowTopic($command)
 			$topics = $this->forum_api->selectForumTopics($user_id, $command['forum_id']);
 			$topic_id = $topics[$topic_index]['topic_id'] ?? 0;
 		}
-		//Permission check needed		
+		//Permission check needed
 		$posts = $this->forum_api->selectTopicPosts($user_id, $topic_id);
 		// Page refers to the page, of the topics-list.
 		$page = $command['page'];
@@ -312,15 +316,18 @@ public function onShowTopic($command)
 				// "Title: <b>$title</b>\n";
 				$text .= $this->language->lang('EBT_TOPIC_TITLE', $title, $viewtopic_url . $topic_id) . PHP_EOL;
 				$text .= $not_approved;
-				$text .= $post['text'];
+				//Add an invisible marker, where the text could be split, if the
+				//full post is too long, to be displayed.
+				$text .= "\u{200B}\u{200B}";
+				$text .= $this->formatters->format_post_for_telegram($post['text']);
 				$readonly = !$post['reply'];
 				$first = false;
 			} else
 			{
 				// "<b>$time:</b> Reply from <b>$user</b>\n";
 				$text .= $this->language->lang('EBT_REPLY_AT_BY', $time, $user) . PHP_EOL;
 				$text .= $not_approved;
-				$text .= $post['text'];
+				$text .= $this->formatters->format_post_for_telegram($post['text']);
 			}
 			$text .= PHP_EOL . '<u>___________________________________</u>' . PHP_EOL;
 		}
@@ -395,14 +402,14 @@ public function onSaveNewTopic($command)
 			return $this->onShowPermissions($command);
 		}
 		$title = $command['title'];
-		$content = $this->format_text($command['text'], $command['entities'] ?? array());
+		$content = $this->formatters->format_input($command['text'], $command['entities'] ?? array());
 		$user = $command['user'];
 		$forum_id = $command['forum_id'];
-		$saved = $this->forum_api->insertNewPost(true, $forum_id, $title, $content, $user);
+		$topic_id = $this->forum_api->insertNewPost(true, $forum_id, $title, $content, $user);
 		//Reset chat_state to topic-display (for back commands)
 		$this->forum_api->store_telegram_chat_state($command['chat_id'], 0, 'T');
 
-		if ($saved)
+		if ($topic_id)
 		{
 			// The following post was saved.
 			$text = $this->language->lang('EBT_TOPIC_SAVED') . PHP_EOL;
@@ -411,7 +418,9 @@ public function onSaveNewTopic($command)
 			$text .= $this->language->lang('EBT_TOPIC_TITLE', $title, '') . PHP_EOL;
 			$text .= $content;
 			$buttons = array($this->language->lang('EBT_BACK') => 'allForumTopics');
-			return [$text, $buttons];
+
+			$command['topic_id'] = $topic_id;
+			return $this->onShowTopic($command);
 		} else
 		{
 			return $this->errorOnSave();
@@ -426,16 +435,18 @@ public function onSaveNewPost(&$command)
 		{
 			return $this->onShowPermissions($command);
 		}
-		$command['text'] = $this->format_text($command['text'], $command['entities'] ?? array());
+		$command['text'] = $this->formatters->format_input($command['text'], $command['entities'] ?? array());
 		$saved = $this->forum_api->insertNewPost(false, $command['forum_id'], $command['topic_id'], $command['text'], $command['user']);
 		//Reset chat_state to topic-display (for back commands)
 		$this->forum_api->store_telegram_chat_state($command['chat_id'], 0, 'T');
-		if ($saved === true)
+		if ($saved)
 		{
 			return $this->onShowTopic($command);
 		} else
 		{
-			$command['admin_info'] = $saved;
+			$topic_id = $command['topic_id'];
+			$username = $command['user']['username'];
+			$command['admin_info'] = "New post for topic $topic_id could not be saved, by user $username";
 			return $this->errorOnSave();
 		}
 	}

diff --git a/core/formatters.php b/core/formatters.php
@@ -0,0 +1,275 @@
+<?php
+/**
+ *
+ * Telegram Bridge. An extension for the phpBB Forum Software package.
+ *
+ * @copyright (c) 2022, Edgar Bolender
+ * @license GNU General Public License, version 2 (GPL-2.0)
+ *
+ */
+
+namespace eb\telegram\core;
+
+/** Create the text and buttons response depending on the state and the users
+ * input.
+ * If an action like saving a post or saving the state is involved, this is
+ * also done here.*/
+class formatters
+{
+
+	/*
+	public function __construct(\phpbb\config\config $config,
+								\phpbb\language\language $language,
+								\eb\telegram\core\forum_api $forum_api
+								)
+	{
+		$this->config = $config;
+		$this->language = $language;
+	}
+	*/
+
+	/** Format a phpbb-post, such that the formatting information
+	 * is transformed into valid telegram formatting.
+	 */
+	public function format_post_for_telegram($text) {
+		$ent = ENT_SUBSTITUTE | ENT_HTML401; //Don't substitute quotes
+		//The telegram bot only allows a predefined set of HTLM-Tags.
+		//The forum posts however surround each opening BBCode with an <s>-Tag (and each closing with an <e>)
+
+		//At first we remove all these tags and their BBCodes, we do not want to show
+		//up in telegram.
+		$bbcode_pattern = '~<s>\[((i|b|u|url|email)(?:(?:=|\s).*)?)\]</s>(.*?)<e>\[/\\2]</e>~';
+		do
+		{
+			$text = preg_replace($bbcode_pattern, '$3', $text, 1, $count);
+		} while ($count > 0);
+
+		$allowed_tags = ['url', 'img', 'b', 'strong', 'i', 'em', 'u', 'ins', 'strike', 'del', 'a', 'code', 'pre'];
+
+		//Now remove all tags, which we do not allow. (Exception list contains all allowed tags.)
+		//This would not be necessary for "normal" text. But when the forum sends notifications,
+		//a lot of additional tags are surrounding the text, BB-Codes, links etc.
+		//$text = strip_tags($text, $allowed_tags); //Needs php version >= 7.4
+		$text = strip_tags($text, '<' . implode('><',$allowed_tags) . '>');
+
+		//Special handling of the url tag: Replace <URL url=...> by <a href=...>
+		$url_pattern = '~<URL url="([^"]*)">(.*?)</URL>~is';
+		$text = preg_replace($url_pattern, '<a href="$1">$2</a>', $text);
+
+		//Special handling of the img tag:
+		//Replace the content of the anchor tag with a <IMAGE>-Placeholder
+		$img_pattern = '~<IMG src="([^"]*)">\[img](.*?)\[/img]</IMG>~is';
+		$text = preg_replace($img_pattern, '<a href="$1">&lt;&lt;IMAGE&gt;&gt;</a>', $text);
+
+		//Add server address to relative links (no http-protokoll) starting with a slash (/)
+		$rel_url_pattern = '~<a href="((?!https?://)/[^"]*)">(.*?)</a>~is';
+		$replace = '<a href="' . generate_board_url(true) . '$1">$2</a>';
+		$text = preg_replace($rel_url_pattern, $replace, $text);
+
+		//Add server address to relative links (no http-protokoll) NOT starting with a slash (/)
+		$rel_url_pattern = '~<a href="((?!https?://)[^"]*)">(.*?)</a>~is';
+		$replace = '<a href="' . generate_board_url(false) . '/$1">$2</a>';
+		$text = preg_replace($rel_url_pattern, $replace, $text);
+
+		//Add a non printable space (ZWSP) to all forward slashes.
+		//By that, telegram does not treat the forward slash as the beginning of a command.
+		//Exclude double // and slashes belonging to html-tags
+		$text = preg_replace('~([^<]/)([^/])~', "$1\u{200B}$2", $text);
+
+		//Revert this, for all hrefs in anchors.
+		//By that, telegram does not treat the forward slash as the beginning of a command.
+		do
+		{
+			$text = preg_replace("~(<a href=\"[^\"]*/)\u{200B}([^\"]*\">)~", '$1$2', $text, 1, $count);
+		} while ($count > 0);
+		return $text;
+	}
+
+	/** Implements the substring function (without length-param) 
+	 * such that html-tags are still correctly opened and closed.
+	 * Cut the beginning of a text at the given offset.
+	 * If the offset happens to lay inside a tagged area,
+	 * the possibly cut off start tags are added again to the beginning of the
+	 * text, such that the tags are still opened and closed correctly.
+	 * If the offset would cut a start tag (before the ending >) or
+	 * an end tag into pieces, the $offset is moved behind the closing >.
+	 * By adding the start-tags before the offset, the text-length is increased.
+	 * To ensure, the text is not longer, than it would be expected by the offset,
+	 * the offset is increased, until the total lenght is less or equal
+	 * than mb_strlen($text) - $offset. 
+	 */
+	public function tag_aware_substr($tagged_text, $offset) {
+		$tags = $this->parse_tags($tagged_text);
+		$prefix = '';
+		$cut_point = $offset - 1;
+		do
+		{
+			$cut_point++;
+			$prefix = $this->adapt_cut_point($tags, $cut_point);
+
+		} while ($cut_point - mb_strlen($prefix) < $offset);
+		return $prefix . mb_substr($tagged_text, $cut_point);
+	}
+
+	/** For a text, which is to be cut, adapt the cut-point, such
+	 * that it does not cut a start or end tag of the text, and
+	 * return the start-tags, which must be added as prefix, because the
+	 * cut text still contains the corresponding end tags.
+	 */
+	public function adapt_cut_point($tags, &$offset) {
+		$print = array();
+		//Move the offset, such that no tag is split
+		foreach ($tags as $tag) {
+			if ($offset <= $tag['full_s'] || $offset >= $tag['full_e'])
+			{
+				continue; //We are outside the enclosing tags.
+			}
+			if ($offset < $tag['full_s'] + mb_strlen($tag['s_tag']))
+			{
+				//Start tag would be cut. Move offset behind start-tag
+				$start_tag = $tag['s_tag'];
+				$offset = $tag['full_s'] + mb_strlen($start_tag);
+				break; //Tags are sequential. Cannot happen again.
+			}
+			if ($offset >= $tag['full_e'] - mb_strlen($tag['e_tag']))
+			{
+				//End tag would be cut, move $offset behind the end-tag.
+				//A pure endtag without previous content also does not make sense,
+				//therefore, this is also skipped. (>= in the condition above)
+				$offset = $tag['full_e'];
+				break; //Tags are sequential. Cannot happen again.
+			}
+		}
+		//Collect the start tags, which must be added to the beginning, such that
+		//their is no end-tag with missing start tag	
+		$start_tags = array();
+		foreach ($tags as $tag) {
+			if ($offset > $tag['full_s'] && $offset < $tag['full_e'])
+			{
+				$start_tag = $tag['s_tag'];
+				$pos = $tag['full_s']; //keep start order
+				$start_tags[$pos] = $start_tag;
+			}
+		}
+		ksort($start_tags);
+		return implode('', $start_tags);
+	}
+
+	/** Find the start and end of text enclosed in html tags.
+	 * The result is an array containing all tagged texts (alos if nested)
+	 * in the following form:
+	 * array( 
+	 *    array('full' => full tag enclosed text,
+	 *          'full_s' => offset, where full text starts
+	 *          'full_e' => offset, where full text ends
+	 *          's_tag' => complete start tag (including attributes)
+	 *          'e_tag' => complete end tag
+	 *  ))
+	 * In case of self closing tags, e_tag is empty.
+	 */
+	public function parse_tags($tagged_text) {
+		$tag_pattern = "/<([\w]+)([^>]*?)(?:([\s]*\/>)|(?:(>)(?:(?:(?:[^<]*?|<\!\-\-.*?\-\->)|(?R))*)(<\/\\1[\s]*>)))/xsmu";
+		$result[] = array('full' => ' ' . $tagged_text, 'full_s' => -1);
+		for($i = 0; $i < count($result); $i++) {
+			$full = $result[$i]['full'];
+			$offset = $result[$i]['full_s'];
+			$t_count = preg_match_all($tag_pattern, mb_substr($full,1), $matches, PREG_OFFSET_CAPTURE);
+			if ($t_count) {
+				$j = 0;
+				foreach($matches[0] as $match) {
+					$mb_offset = mb_strlen(substr(mb_substr($full,1), 0, $match[1]));
+					$result[] = array(
+						'full' => $match[0],
+						'full_s' => $mb_offset + 1 + $offset,
+						'full_e' => $mb_offset + 1 + mb_strlen($match[0]) + $offset,
+						's_tag' => '<' . $matches[1][$j][0] . $matches[2][$j][0] . $matches[3][$j][0] . $matches[4][$j][0],
+						'e_tag' => $matches[5][$j][0],
+					);
+					$j++;
+				}
+			}
+		}
+		//Remove the first full text entry
+		return array_slice($result,1);
+	}
+
+	/** Format the telegram input by adding bbCodes according to the formatting information,
+	 * which telegram sends as so called entities.
+	 */
+	public function format_input($text, $entities)
+	{
+		/* Split the text, at every point where a formatting starts or ends into an array.
+		 * Therefore we collect at first all splitpoints, and remove duplicates.
+		 */
+		$split_points[] = 0;
+		foreach ($entities as $entity)
+		{
+			$split_points[] = $entity->offset;
+			$split_points[] = $entity->offset + $entity->length;
+		}
+		$split_points = array_unique($split_points);
+		rsort($split_points);
+		$chunks = array();
+		foreach ($split_points as $point)
+		{
+			$chunks[$point] = mb_substr($text, $point);
+			$text = mb_substr($text, 0, $point);
+		}
+		ksort($chunks);
+		//Sort by end of formatting, such that in case of overlapping formats, the opening tag
+		//for the format, that gets closed last is placed at first.
+		usort($entities, function($a, $b)
+						{
+							return (($a->offset + $a->length) < ($b->offset + $b->length)) ? -1 : 1;
+						});
+		foreach ($entities as $entity)
+		{
+			$bbcode = $this->get_bbcode($entity->type);
+			if (!$bbcode)
+			{
+				continue;
+			}
+			$chunks[$entity->offset] = $bbcode . $chunks[$entity->offset];
+		}
+		for ($i = count($entities) - 1; $i >= 0; $i--)
+		{
+			$entity = $entities[$i];
+			$bbcode = $this->get_bbcode($entity->type, false);
+			if (!$bbcode)
+			{
+				continue;
+			}
+			$bbcode_start = $this->get_bbcode($entity->type);
+			$end = $entity->offset + $entity->length;
+			if (strpos($chunks[$end], $bbcode_start) === 0)
+			{
+				//Remove ending tag immediatly followed by starting tag
+				$chunks[$end] = substr($chunks[$end], strlen($bbcode_start));
+			} else
+			{
+				$chunks[$end] = $bbcode . $chunks[$end];
+			}
+		}
+		//Remove non printable whitespace, which may have been included, when user copies
+		//a part of a post, where the whitespace was added. (See telegrami_api->htmlentitiesForTelegram)
+		$text = implode('', $chunks);
+		$text = str_replace("/\u{200B}", "/", $text);
+		return $text;
+	}
+
+	private function get_bbcode($format_type, $start = true)
+	{
+		switch ($format_type)
+		{
+			case 'bold': return $start ? '[b]' : '[/b]';
+			case 'italic': return $start ? '[i]' : '[/i]';
+			case 'underline': return $start ? '[u]' : '[/u]';
+			case 'code': return $start ? '[code]' : '[/code]';
+			case 'pre': return $start ? '[code]' : '[/code]';
+			case 'strikethrough': return $start ? '<del>' : '</del>';
+			case 'url': return ''; //No need for BBCode
+			default: return false;
+		}
+	}
+
+}
diff --git a/core/forum_api.php b/core/forum_api.php
@@ -520,7 +520,8 @@ public function insertNewPost($new_topic, $forum_id, $topic_id_or_title, $text,
 		{
 			$user->data[$prop] = $userOrigData[$prop];
 		}
-		return $url ? true : false;
+		//For new topics, topic_id was set in submit_post.
+		return $url ? $data['topic_id'] : false;
 	}
 
 	private function print_formatted($obj)